Diffstat (limited to 'src/intel/vulkan')
-rw-r--r--  src/intel/vulkan/TODO | 13
-rw-r--r--  src/intel/vulkan/anv_acceleration_structure.c | 250
-rw-r--r--  src/intel/vulkan/anv_allocator.c | 1280
-rw-r--r--  src/intel/vulkan/anv_android.c | 435
-rw-r--r--  src/intel/vulkan/anv_android.h | 27
-rw-r--r--  src/intel/vulkan/anv_android_stubs.c | 28
-rw-r--r--  src/intel/vulkan/anv_astc_emu.c | 516
-rw-r--r--  src/intel/vulkan/anv_batch_chain.c | 1955
-rw-r--r--  src/intel/vulkan/anv_blorp.c | 1974
-rw-r--r--  src/intel/vulkan/anv_bo_sync.c | 240
-rw-r--r--  src/intel/vulkan/anv_cmd_buffer.c | 2005
-rw-r--r--  src/intel/vulkan/anv_descriptor_set.c | 2325
-rw-r--r--  src/intel/vulkan/anv_device.c | 6700
-rw-r--r--  src/intel/vulkan/anv_formats.c | 1484
-rw-r--r--  src/intel/vulkan/anv_gem.c | 744
-rw-r--r--  src/intel/vulkan/anv_gem_stubs.c | 299
-rw-r--r--  src/intel/vulkan/anv_genX.h | 301
-rw-r--r--  src/intel/vulkan/anv_image.c | 2980
-rw-r--r--  src/intel/vulkan/anv_internal_kernels.c | 369
-rw-r--r--  src/intel/vulkan/anv_internal_kernels.h | 131
-rw-r--r--  src/intel/vulkan/anv_kmd_backend.c | 42
-rw-r--r--  src/intel/vulkan/anv_kmd_backend.h | 136
-rw-r--r--  src/intel/vulkan/anv_measure.c | 212
-rw-r--r--  src/intel/vulkan/anv_measure.h | 6
-rw-r--r--  src/intel/vulkan/anv_mesh_perprim_wa.c | 533
-rw-r--r--  src/intel/vulkan/anv_nir.h | 93
-rw-r--r--  src/intel/vulkan/anv_nir_add_base_work_group_id.c | 72
-rw-r--r--  src/intel/vulkan/anv_nir_apply_pipeline_layout.c | 2077
-rw-r--r--  src/intel/vulkan/anv_nir_compute_push_layout.c | 179
-rw-r--r--  src/intel/vulkan/anv_nir_lower_load_patch_vertices_in.c | 66
-rw-r--r--  src/intel/vulkan/anv_nir_lower_multiview.c | 208
-rw-r--r--  src/intel/vulkan/anv_nir_lower_resource_intel.c | 170
-rw-r--r--  src/intel/vulkan/anv_nir_lower_ubo_loads.c | 53
-rw-r--r--  src/intel/vulkan/anv_nir_lower_ycbcr_textures.c | 367
-rw-r--r--  src/intel/vulkan/anv_nir_push_descriptor_analysis.c | 261
-rw-r--r--  src/intel/vulkan/anv_pass.c | 490
-rw-r--r--  src/intel/vulkan/anv_perf.c | 81
-rw-r--r--  src/intel/vulkan/anv_pipeline.c | 4256
-rw-r--r--  src/intel/vulkan/anv_pipeline_cache.c | 1009
-rw-r--r--  src/intel/vulkan/anv_private.h | 5516
-rw-r--r--  src/intel/vulkan/anv_queue.c | 2678
-rw-r--r--  src/intel/vulkan/anv_rmv.c | 864
-rw-r--r--  src/intel/vulkan/anv_rmv.h | 118
-rw-r--r--  src/intel/vulkan/anv_sparse.c | 1293
-rw-r--r--  src/intel/vulkan/anv_util.c | 174
-rw-r--r--  src/intel/vulkan/anv_utrace.c | 684
-rw-r--r--  src/intel/vulkan/anv_va.c | 195
-rw-r--r--  src/intel/vulkan/anv_video.c | 435
-rw-r--r--  src/intel/vulkan/anv_wsi.c | 359
-rw-r--r--  src/intel/vulkan/anv_wsi_display.c | 338
-rw-r--r--  src/intel/vulkan/anv_wsi_x11.c | 96
-rw-r--r--  src/intel/vulkan/genX_acceleration_structure.c | 1287
-rw-r--r--  src/intel/vulkan/genX_blorp_exec.c | 361
-rw-r--r--  src/intel/vulkan/genX_cmd_buffer.c | 8901
-rw-r--r--  src/intel/vulkan/genX_cmd_compute.c | 1168
-rw-r--r--  src/intel/vulkan/genX_cmd_draw.c | 2330
-rw-r--r--  src/intel/vulkan/genX_cmd_draw_generated_flush.h | 79
-rw-r--r--  src/intel/vulkan/genX_cmd_draw_generated_indirect.h | 656
-rw-r--r--  src/intel/vulkan/genX_cmd_draw_helpers.h | 153
-rw-r--r--  src/intel/vulkan/genX_cmd_video.c | 1195
-rw-r--r--  src/intel/vulkan/genX_gfx_state.c | 2385
-rw-r--r--  src/intel/vulkan/genX_gpu_memcpy.c | 353
-rw-r--r--  src/intel/vulkan/genX_init_state.c | 1446
-rw-r--r--  src/intel/vulkan/genX_internal_kernels.c | 111
-rw-r--r--  src/intel/vulkan/genX_pipeline.c | 3144
-rw-r--r--  src/intel/vulkan/genX_query.c | 959
-rw-r--r--  src/intel/vulkan/genX_simple_shader.c | 704
-rw-r--r--  src/intel/vulkan/genX_state.c | 894
-rw-r--r--  src/intel/vulkan/gfx7_cmd_buffer.c | 476
-rw-r--r--  src/intel/vulkan/gfx8_cmd_buffer.c | 844
-rw-r--r--  src/intel/vulkan/grl/.gitignore | 1
-rw-r--r--  src/intel/vulkan/grl/genX_grl.h | 54
-rw-r--r--  src/intel/vulkan/grl/genX_grl_dispatch.c | 113
-rw-r--r--  src/intel/vulkan/grl/genX_grl_uuid.cpp | 40
-rw-r--r--  src/intel/vulkan/grl/gpu/AABB.h | 450
-rw-r--r--  src/intel/vulkan/grl/gpu/api_interface.h | 840
-rw-r--r--  src/intel/vulkan/grl/gpu/atomic_update.cl | 1112
-rw-r--r--  src/intel/vulkan/grl/gpu/atomic_update.grl | 198
-rw-r--r--  src/intel/vulkan/grl/gpu/binned_sah_shared.h | 265
-rw-r--r--  src/intel/vulkan/grl/gpu/build_leaf.grl | 206
-rw-r--r--  src/intel/vulkan/grl/gpu/build_primref.grl | 229
-rw-r--r--  src/intel/vulkan/grl/gpu/build_refit.grl | 324
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_BFS.cl | 4823
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_DFS.cl | 2025
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_leaf.cl | 357
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_presplit.cl | 556
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_primref.cl | 674
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_primref.h | 246
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_refit.cl | 491
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_refit.h | 546
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl | 1917
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h | 1507
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_copy.cl | 763
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_debug.cl | 208
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_debug.grl | 107
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl | 97
-rw-r--r--  src/intel/vulkan/grl/gpu/bvh_rebraid.cl | 1683
-rw-r--r--  src/intel/vulkan/grl/gpu/common.h | 429
-rw-r--r--  src/intel/vulkan/grl/gpu/copy.grl | 129
-rw-r--r--  src/intel/vulkan/grl/gpu/d3d12.h | 525
-rw-r--r--  src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl | 59
-rw-r--r--  src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl | 27
-rw-r--r--  src/intel/vulkan/grl/gpu/input_dump.cl | 723
-rw-r--r--  src/intel/vulkan/grl/gpu/input_dump.grl | 252
-rw-r--r--  src/intel/vulkan/grl/gpu/instance.h | 183
-rw-r--r--  src/intel/vulkan/grl/gpu/intrinsics.h | 581
-rw-r--r--  src/intel/vulkan/grl/gpu/libs/libraries.grl | 13
-rw-r--r--  src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl | 1033
-rw-r--r--  src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h | 207
-rw-r--r--  src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl | 898
-rw-r--r--  src/intel/vulkan/grl/gpu/mem_utils.h | 161
-rw-r--r--  src/intel/vulkan/grl/gpu/misc.cl | 367
-rw-r--r--  src/intel/vulkan/grl/gpu/misc.grl | 278
-rw-r--r--  src/intel/vulkan/grl/gpu/misc_legacy.cl | 386
-rw-r--r--  src/intel/vulkan/grl/gpu/misc_shared.h | 196
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/morton_common.h | 245
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/phase0.cl | 400
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/phase1.cl | 785
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/phase2.cl | 314
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/post_sort.cl | 521
-rw-r--r--  src/intel/vulkan/grl/gpu/morton/pre_sort.cl | 117
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_builder.grl | 335
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl | 9
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h | 924
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h | 135
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_radix_sort.cl | 9
-rw-r--r--  src/intel/vulkan/grl/gpu/morton_radix_sort.h | 855
-rw-r--r--  src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl | 297
-rw-r--r--  src/intel/vulkan/grl/gpu/new_sah_builder.grl | 665
-rw-r--r--  src/intel/vulkan/grl/gpu/postbuild_info.grl | 49
-rw-r--r--  src/intel/vulkan/grl/gpu/presplit.grl | 62
-rw-r--r--  src/intel/vulkan/grl/gpu/qbvh6.h | 933
-rw-r--r--  src/intel/vulkan/grl/gpu/quad.h | 127
-rw-r--r--  src/intel/vulkan/grl/gpu/radix_sort.grl | 163
-rw-r--r--  src/intel/vulkan/grl/gpu/rebraid.grl | 167
-rw-r--r--  src/intel/vulkan/grl/gpu/shared.h | 182
-rw-r--r--  src/intel/vulkan/grl/gpu/structs.grl | 38
-rw-r--r--  src/intel/vulkan/grl/gpu/traversal_shader.cl | 277
-rw-r--r--  src/intel/vulkan/grl/gpu/traversal_shader.grl | 244
-rw-r--r--  src/intel/vulkan/grl/grl_cl_kernel_gen.py | 226
-rw-r--r--  src/intel/vulkan/grl/grl_metakernel_gen.py | 933
-rw-r--r--  src/intel/vulkan/grl/grl_parser.py | 586
-rw-r--r--  src/intel/vulkan/grl/grl_structs.h | 479
-rw-r--r--  src/intel/vulkan/grl/include/AABB3f.h | 459
-rw-r--r--  src/intel/vulkan/grl/include/GRLGen12.h | 691
-rw-r--r--  src/intel/vulkan/grl/include/GRLIntTypes.h | 152
-rw-r--r--  src/intel/vulkan/grl/include/GRLOCLCompatibility.h | 210
-rw-r--r--  src/intel/vulkan/grl/include/GRLRTASCommon.h | 142
-rw-r--r--  src/intel/vulkan/grl/include/GRLStructs.h | 60
-rw-r--r--  src/intel/vulkan/grl/include/GRLUtilities.h | 32
-rw-r--r--  src/intel/vulkan/grl/include/affinespace.h | 192
-rw-r--r--  src/intel/vulkan/grl/meson.build | 203
-rw-r--r--  src/intel/vulkan/i915/anv_batch_chain.c | 1107
-rw-r--r--  src/intel/vulkan/i915/anv_batch_chain.h | 62
-rw-r--r--  src/intel/vulkan/i915/anv_device.c | 400
-rw-r--r--  src/intel/vulkan/i915/anv_device.h | 47
-rw-r--r--  src/intel/vulkan/i915/anv_gem.c | 139
-rw-r--r--  src/intel/vulkan/i915/anv_gem.h | 43
-rw-r--r--  src/intel/vulkan/i915/anv_kmd_backend.c | 306
-rw-r--r--  src/intel/vulkan/i915/anv_queue.c | 126
-rw-r--r--  src/intel/vulkan/i915/anv_queue.h | 35
-rw-r--r--  src/intel/vulkan/layers/anv_android_layer.c (renamed from src/intel/vulkan/anv_wsi_wayland.c) | 43
-rw-r--r--  src/intel/vulkan/layers/anv_doom64.c | 137
-rw-r--r--  src/intel/vulkan/layers/anv_hitman3.c | 41
-rw-r--r--  src/intel/vulkan/layers/anv_rmv_layer.c | 122
-rw-r--r--  src/intel/vulkan/meson.build | 243
-rw-r--r--  src/intel/vulkan/tests/anv_tests.cpp | 25
-rw-r--r--  src/intel/vulkan/tests/block_pool_grow_first.c | 26
-rw-r--r--  src/intel/vulkan/tests/block_pool_max_size.c | 73
-rw-r--r--  src/intel/vulkan/tests/block_pool_no_free.c | 45
-rw-r--r--  src/intel/vulkan/tests/state_pool.c | 37
-rw-r--r--  src/intel/vulkan/tests/state_pool_free_list_only.c | 39
-rw-r--r--  src/intel/vulkan/tests/state_pool_max_size.c | 131
-rw-r--r--  src/intel/vulkan/tests/state_pool_no_free.c | 28
-rw-r--r--  src/intel/vulkan/tests/state_pool_padding.c | 29
-rw-r--r--  src/intel/vulkan/tests/state_pool_test_helper.h | 53
-rw-r--r--  src/intel/vulkan/tests/test_common.h | 31
-rw-r--r--  src/intel/vulkan/xe/anv_batch_chain.c | 409
-rw-r--r--  src/intel/vulkan/xe/anv_batch_chain.h | 65
-rw-r--r--  src/intel/vulkan/xe/anv_device.c | 199
-rw-r--r--  src/intel/vulkan/xe/anv_device.h | 40
-rw-r--r--  src/intel/vulkan/xe/anv_kmd_backend.c | 355
-rw-r--r--  src/intel/vulkan/xe/anv_queue.c | 164
-rw-r--r--  src/intel/vulkan/xe/anv_queue.h | 35
184 files changed, 88058 insertions, 29637 deletions
diff --git a/src/intel/vulkan/TODO b/src/intel/vulkan/TODO
deleted file mode 100644
index 4c41e251888..00000000000
--- a/src/intel/vulkan/TODO
+++ /dev/null
@@ -1,13 +0,0 @@
-Intel Vulkan ToDo
-=================
-
-Missing Features:
- - Investigate CTS failures on HSW
- - Sparse memory
-
-Performance:
- - Multi-{sampled/gfx8,LOD} HiZ
- - MSAA fast clears
- - Pushing pieces of UBOs?
- - Enable guardband clipping
- - Use soft-pin to avoid relocations
diff --git a/src/intel/vulkan/anv_acceleration_structure.c b/src/intel/vulkan/anv_acceleration_structure.c
deleted file mode 100644
index 1d0ccc0b410..00000000000
--- a/src/intel/vulkan/anv_acceleration_structure.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright © 2020 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_private.h"
-
-void
-anv_GetAccelerationStructureBuildSizesKHR(
- VkDevice device,
- VkAccelerationStructureBuildTypeKHR buildType,
- const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo,
- const uint32_t* pMaxPrimitiveCounts,
- VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo)
-{
- assert(pSizeInfo->sType ==
- VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR);
-
- uint64_t max_prim_count = 0;
- for (uint32_t i = 0; i < pBuildInfo->geometryCount; i++)
- max_prim_count += pMaxPrimitiveCounts[i];
-
- pSizeInfo->accelerationStructureSize = 0; /* TODO */
-
- uint64_t cpu_build_scratch_size = 0; /* TODO */
- uint64_t cpu_update_scratch_size = cpu_build_scratch_size;
-
- uint64_t gpu_build_scratch_size = 0; /* TODO */
- uint64_t gpu_update_scratch_size = gpu_build_scratch_size;
-
- switch (buildType) {
- case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_HOST_KHR:
- pSizeInfo->buildScratchSize = cpu_build_scratch_size;
- pSizeInfo->updateScratchSize = cpu_update_scratch_size;
- break;
-
- case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_DEVICE_KHR:
- pSizeInfo->buildScratchSize = gpu_build_scratch_size;
- pSizeInfo->updateScratchSize = gpu_update_scratch_size;
- break;
-
- case VK_ACCELERATION_STRUCTURE_BUILD_TYPE_HOST_OR_DEVICE_KHR:
- pSizeInfo->buildScratchSize = MAX2(cpu_build_scratch_size,
- gpu_build_scratch_size);
- pSizeInfo->updateScratchSize = MAX2(cpu_update_scratch_size,
- gpu_update_scratch_size);
- break;
-
- default:
- unreachable("Invalid acceleration structure build type");
- }
-}
-
-VkResult
-anv_CreateAccelerationStructureKHR(
- VkDevice _device,
- const VkAccelerationStructureCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkAccelerationStructureKHR* pAccelerationStructure)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer);
- struct anv_acceleration_structure *accel;
-
- accel = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*accel), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (accel == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- vk_object_base_init(&device->vk, &accel->base,
- VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR);
-
- accel->size = pCreateInfo->size;
- accel->address = anv_address_add(buffer->address, pCreateInfo->offset);
-
- *pAccelerationStructure = anv_acceleration_structure_to_handle(accel);
-
- return VK_SUCCESS;
-}
-
-void
-anv_DestroyAccelerationStructureKHR(
- VkDevice _device,
- VkAccelerationStructureKHR accelerationStructure,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_acceleration_structure, accel, accelerationStructure);
-
- if (!accel)
- return;
-
- vk_object_base_finish(&accel->base);
- vk_free2(&device->vk.alloc, pAllocator, accel);
-}
-
-VkDeviceAddress
-anv_GetAccelerationStructureDeviceAddressKHR(
- VkDevice device,
- const VkAccelerationStructureDeviceAddressInfoKHR* pInfo)
-{
- ANV_FROM_HANDLE(anv_acceleration_structure, accel,
- pInfo->accelerationStructure);
-
- assert(!anv_address_is_null(accel->address));
- assert(accel->address.bo->flags & EXEC_OBJECT_PINNED);
-
- return anv_address_physical(accel->address);
-}
-
-void
-anv_GetDeviceAccelerationStructureCompatibilityKHR(
- VkDevice device,
- const VkAccelerationStructureVersionInfoKHR* pVersionInfo,
- VkAccelerationStructureCompatibilityKHR* pCompatibility)
-{
- unreachable("Unimplemented");
-}
-
-VkResult
-anv_BuildAccelerationStructuresKHR(
- VkDevice device,
- VkDeferredOperationKHR deferredOperation,
- uint32_t infoCount,
- const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
- const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-VkResult
-anv_CopyAccelerationStructureKHR(
- VkDevice device,
- VkDeferredOperationKHR deferredOperation,
- const VkCopyAccelerationStructureInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-VkResult
-anv_CopyAccelerationStructureToMemoryKHR(
- VkDevice device,
- VkDeferredOperationKHR deferredOperation,
- const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-VkResult
-anv_CopyMemoryToAccelerationStructureKHR(
- VkDevice device,
- VkDeferredOperationKHR deferredOperation,
- const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-VkResult
-anv_WriteAccelerationStructuresPropertiesKHR(
- VkDevice device,
- uint32_t accelerationStructureCount,
- const VkAccelerationStructureKHR* pAccelerationStructures,
- VkQueryType queryType,
- size_t dataSize,
- void* pData,
- size_t stride)
-{
- unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-void
-anv_CmdBuildAccelerationStructuresKHR(
- VkCommandBuffer commandBuffer,
- uint32_t infoCount,
- const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
- const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdBuildAccelerationStructuresIndirectKHR(
- VkCommandBuffer commandBuffer,
- uint32_t infoCount,
- const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
- const VkDeviceAddress* pIndirectDeviceAddresses,
- const uint32_t* pIndirectStrides,
- const uint32_t* const* ppMaxPrimitiveCounts)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdCopyAccelerationStructureKHR(
- VkCommandBuffer commandBuffer,
- const VkCopyAccelerationStructureInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdCopyAccelerationStructureToMemoryKHR(
- VkCommandBuffer commandBuffer,
- const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdCopyMemoryToAccelerationStructureKHR(
- VkCommandBuffer commandBuffer,
- const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
-{
- unreachable("Unimplemented");
-}
-
-void
-anv_CmdWriteAccelerationStructuresPropertiesKHR(
- VkCommandBuffer commandBuffer,
- uint32_t accelerationStructureCount,
- const VkAccelerationStructureKHR* pAccelerationStructures,
- VkQueryType queryType,
- VkQueryPool queryPool,
- uint32_t firstQuery)
-{
- unreachable("Unimplemented");
-}
diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
index 63eb855b85e..22af7f0ed1b 100644
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -31,6 +31,7 @@
#include "common/intel_aux_map.h"
#include "util/anon_file.h"
+#include "util/futex.h"
#ifdef HAVE_VALGRIND
#define VG_NOACCESS_READ(__ptr) ({ \
@@ -78,7 +79,7 @@
* our allocation fast-path, there isn't really a way to munmap the old mmap,
* so we just keep it around until garbage collection time. While the block
* allocator is lockless for normal operations, we block other threads trying
- * to allocate while we're growing the map. It sholdn't happen often, and
+ * to allocate while we're growing the map. It shouldn't happen often, and
* growing is fast anyway.
*
* At the next level we can use various sub-allocators. The state pool is a
@@ -112,24 +113,6 @@
#define PAGE_SIZE 4096
#endif
-struct anv_mmap_cleanup {
- void *map;
- size_t size;
-};
-
-static inline uint32_t
-ilog2_round_up(uint32_t value)
-{
- assert(value != 0);
- return 32 - __builtin_clz(value - 1);
-}
-
-static inline uint32_t
-round_to_power_of_two(uint32_t value)
-{
- return 1 << ilog2_round_up(value);
-}
-
struct anv_state_table_cleanup {
void *map;
size_t size;
@@ -155,15 +138,12 @@ anv_state_table_init(struct anv_state_table *table,
* userptr and send a chunk of it off to the GPU.
*/
table->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "state table");
- if (table->fd == -1) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_fd;
- }
+ if (table->fd == -1)
+ return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
- if (!u_vector_init(&table->cleanups,
- round_to_power_of_two(sizeof(struct anv_state_table_cleanup)),
- 128)) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ if (!u_vector_init(&table->cleanups, 8,
+ sizeof(struct anv_state_table_cleanup))) {
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_fd;
}
@@ -197,11 +177,11 @@ anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
/* Make sure that we don't go outside the bounds of the memfd */
if (size > BLOCK_POOL_MEMFD_SIZE)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
cleanup = u_vector_add(&table->cleanups);
if (!cleanup)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(table->device, VK_ERROR_OUT_OF_HOST_MEMORY);
*cleanup = ANV_STATE_TABLE_CLEANUP_INIT;
@@ -214,8 +194,8 @@ anv_state_table_expand_range(struct anv_state_table *table, uint32_t size)
map = mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, table->fd, 0);
if (map == MAP_FAILED) {
- return vk_errorf(table->device, &table->device->vk.base,
- VK_ERROR_OUT_OF_HOST_MEMORY, "mmap failed: %m");
+ return vk_errorf(table->device, VK_ERROR_OUT_OF_HOST_MEMORY,
+ "mmap failed: %m");
}
cleanup->map = map;
@@ -232,8 +212,8 @@ anv_state_table_grow(struct anv_state_table *table)
{
VkResult result = VK_SUCCESS;
- uint32_t used = align_u32(table->state.next * ANV_STATE_ENTRY_SIZE,
- PAGE_SIZE);
+ uint32_t used = align(table->state.next * ANV_STATE_ENTRY_SIZE,
+ PAGE_SIZE);
uint32_t old_size = table->size;
/* The block pool is always initialized to a nonzero size and this function
@@ -312,7 +292,7 @@ anv_state_table_add(struct anv_state_table *table, uint32_t *idx,
old.u64 = __sync_lock_test_and_set(&table->state.u64, new.u64);
if (old.next != state.next)
- futex_wake(&table->state.end, INT_MAX);
+ futex_wake(&table->state.end, INT32_MAX);
} else {
futex_wait(&table->state.end, state.end, NULL);
continue;
@@ -364,62 +344,46 @@ anv_free_list_pop(union anv_free_list *list,
}
static VkResult
-anv_block_pool_expand_range(struct anv_block_pool *pool,
- uint32_t center_bo_offset, uint32_t size);
+anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size);
VkResult
anv_block_pool_init(struct anv_block_pool *pool,
struct anv_device *device,
const char *name,
uint64_t start_address,
- uint32_t initial_size)
+ uint32_t initial_size,
+ uint32_t max_size)
{
VkResult result;
+ /* Make sure VMA addresses are aligned for the block pool */
+ assert(anv_is_aligned(start_address, device->info->mem_alignment));
+ assert(anv_is_aligned(initial_size, device->info->mem_alignment));
+ assert(max_size > 0);
+ assert(max_size > initial_size);
+
pool->name = name;
pool->device = device;
- pool->use_softpin = device->physical->use_softpin;
pool->nbos = 0;
pool->size = 0;
- pool->center_bo_offset = 0;
pool->start_address = intel_canonical_address(start_address);
- pool->map = NULL;
+ pool->max_size = max_size;
- if (pool->use_softpin) {
- pool->bo = NULL;
- pool->fd = -1;
- } else {
- /* Just make it 2GB up-front. The Linux kernel won't actually back it
- * with pages until we either map and fault on one of them or we use
- * userptr and send a chunk of it off to the GPU.
- */
- pool->fd = os_create_anonymous_file(BLOCK_POOL_MEMFD_SIZE, "block pool");
- if (pool->fd == -1)
- return vk_error(VK_ERROR_INITIALIZATION_FAILED);
-
- pool->wrapper_bo = (struct anv_bo) {
- .refcount = 1,
- .offset = -1,
- .is_wrapper = true,
- };
- pool->bo = &pool->wrapper_bo;
- }
-
- if (!u_vector_init(&pool->mmap_cleanups,
- round_to_power_of_two(sizeof(struct anv_mmap_cleanup)),
- 128)) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_fd;
- }
+ pool->bo = NULL;
pool->state.next = 0;
pool->state.end = 0;
- pool->back_state.next = 0;
- pool->back_state.end = 0;
- result = anv_block_pool_expand_range(pool, 0, initial_size);
+ pool->bo_alloc_flags =
+ ANV_BO_ALLOC_FIXED_ADDRESS |
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ ANV_BO_ALLOC_CAPTURE |
+ ANV_BO_ALLOC_INTERNAL;
+
+ result = anv_block_pool_expand_range(pool, initial_size);
if (result != VK_SUCCESS)
- goto fail_mmap_cleanups;
+ return result;
/* Make the entire pool available in the front of the pool. If back
* allocation needs to use this space, the "ends" will be re-arranged.
@@ -427,47 +391,22 @@ anv_block_pool_init(struct anv_block_pool *pool,
pool->state.end = pool->size;
return VK_SUCCESS;
-
- fail_mmap_cleanups:
- u_vector_finish(&pool->mmap_cleanups);
- fail_fd:
- if (pool->fd >= 0)
- close(pool->fd);
-
- return result;
}
void
anv_block_pool_finish(struct anv_block_pool *pool)
{
anv_block_pool_foreach_bo(bo, pool) {
- if (bo->map)
- anv_gem_munmap(pool->device, bo->map, bo->size);
- anv_gem_close(pool->device, bo->gem_handle);
+ assert(bo->refcount == 1);
+ anv_device_release_bo(pool->device, bo);
}
-
- struct anv_mmap_cleanup *cleanup;
- u_vector_foreach(cleanup, &pool->mmap_cleanups)
- munmap(cleanup->map, cleanup->size);
- u_vector_finish(&pool->mmap_cleanups);
-
- if (pool->fd >= 0)
- close(pool->fd);
}
static VkResult
-anv_block_pool_expand_range(struct anv_block_pool *pool,
- uint32_t center_bo_offset, uint32_t size)
+anv_block_pool_expand_range(struct anv_block_pool *pool, uint32_t size)
{
/* Assert that we only ever grow the pool */
- assert(center_bo_offset >= pool->back_state.end);
- assert(size - center_bo_offset >= pool->state.end);
-
- /* Assert that we don't go outside the bounds of the memfd */
- assert(center_bo_offset <= BLOCK_POOL_MEMFD_CENTER);
- assert(pool->use_softpin ||
- size - center_bo_offset <=
- BLOCK_POOL_MEMFD_SIZE - BLOCK_POOL_MEMFD_CENTER);
+ assert(size >= pool->state.end);
/* For state pool BOs we have to be a bit careful about where we place them
* in the GTT. There are two documented workarounds for state base address
@@ -495,73 +434,22 @@ anv_block_pool_expand_range(struct anv_block_pool *pool,
* hard work for us. When using softpin, we're in control and the fixed
* addresses we choose are fine for base addresses.
*/
- enum anv_bo_alloc_flags bo_alloc_flags = ANV_BO_ALLOC_CAPTURE;
- if (!pool->use_softpin)
- bo_alloc_flags |= ANV_BO_ALLOC_32BIT_ADDRESS;
-
- if (pool->use_softpin) {
- uint32_t new_bo_size = size - pool->size;
- struct anv_bo *new_bo;
- assert(center_bo_offset == 0);
- VkResult result = anv_device_alloc_bo(pool->device,
- pool->name,
- new_bo_size,
- bo_alloc_flags |
- ANV_BO_ALLOC_LOCAL_MEM |
- ANV_BO_ALLOC_FIXED_ADDRESS |
- ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED,
- pool->start_address + pool->size,
- &new_bo);
- if (result != VK_SUCCESS)
- return result;
-
- pool->bos[pool->nbos++] = new_bo;
-
- /* This pointer will always point to the first BO in the list */
- pool->bo = pool->bos[0];
- } else {
- /* Just leak the old map until we destroy the pool. We can't munmap it
- * without races or imposing locking on the block allocate fast path. On
- * the whole the leaked maps adds up to less than the size of the
- * current map. MAP_POPULATE seems like the right thing to do, but we
- * should try to get some numbers.
- */
- void *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, pool->fd,
- BLOCK_POOL_MEMFD_CENTER - center_bo_offset);
- if (map == MAP_FAILED)
- return vk_errorf(pool->device, &pool->device->vk.base,
- VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
-
- struct anv_bo *new_bo;
- VkResult result = anv_device_import_bo_from_host_ptr(pool->device,
- map, size,
- bo_alloc_flags,
- 0 /* client_address */,
- &new_bo);
- if (result != VK_SUCCESS) {
- munmap(map, size);
- return result;
- }
- struct anv_mmap_cleanup *cleanup = u_vector_add(&pool->mmap_cleanups);
- if (!cleanup) {
- munmap(map, size);
- anv_device_release_bo(pool->device, new_bo);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- cleanup->map = map;
- cleanup->size = size;
+ uint32_t new_bo_size = size - pool->size;
+ struct anv_bo *new_bo = NULL;
+ VkResult result = anv_device_alloc_bo(pool->device,
+ pool->name,
+ new_bo_size,
+ pool->bo_alloc_flags,
+ intel_48b_address(pool->start_address + pool->size),
+ &new_bo);
+ if (result != VK_SUCCESS)
+ return result;
- /* Now that we mapped the new memory, we can write the new
- * center_bo_offset back into pool and update pool->map. */
- pool->center_bo_offset = center_bo_offset;
- pool->map = map + center_bo_offset;
+ pool->bos[pool->nbos++] = new_bo;
- pool->bos[pool->nbos++] = new_bo;
- pool->wrapper_bo.map = new_bo;
- }
+ /* This pointer will always point to the first BO in the list */
+ pool->bo = pool->bos[0];
assert(pool->nbos < ANV_MAX_BLOCK_POOL_BOS);
pool->size = size;
@@ -578,24 +466,20 @@ anv_block_pool_expand_range(struct anv_block_pool *pool,
void*
anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
{
- if (pool->use_softpin) {
- struct anv_bo *bo = NULL;
- int32_t bo_offset = 0;
- anv_block_pool_foreach_bo(iter_bo, pool) {
- if (offset < bo_offset + iter_bo->size) {
- bo = iter_bo;
- break;
- }
- bo_offset += iter_bo->size;
+ struct anv_bo *bo = NULL;
+ int32_t bo_offset = 0;
+ anv_block_pool_foreach_bo(iter_bo, pool) {
+ if (offset < bo_offset + iter_bo->size) {
+ bo = iter_bo;
+ break;
}
- assert(bo != NULL);
- assert(offset >= bo_offset);
- assert((offset - bo_offset) + size <= bo->size);
-
- return bo->map + (offset - bo_offset);
- } else {
- return pool->map + offset;
+ bo_offset += iter_bo->size;
}
+ assert(bo != NULL);
+ assert(offset >= bo_offset);
+ assert((offset - bo_offset) + size <= bo->size);
+
+ return bo->map + (offset - bo_offset);
}
/** Grows and re-centers the block pool.
@@ -612,14 +496,10 @@ anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t size)
* allocated for each end as we have used. This way the pool doesn't
* grow too far in one direction or the other.
*
- * 4) If the _alloc_back() has never been called, then the back portion of
- * the pool retains a size of zero. (This makes it easier for users of
- * the block pool that only want a one-sided pool.)
- *
- * 5) We have enough space allocated for at least one more block in
+ * 4) We have enough space allocated for at least one more block in
* whichever side `state` points to.
*
- * 6) The center of the pool is always aligned to both the block_size of
+ * 5) The center of the pool is always aligned to both the block_size of
* the pool and a 4K CPU page.
*/
static uint32_t
@@ -630,10 +510,10 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
pthread_mutex_lock(&pool->device->mutex);
- assert(state == &pool->state || state == &pool->back_state);
+ assert(state == &pool->state);
/* Gather a little usage information on the pool. Since we may have
- * threadsd waiting in queue to get some storage while we resize, it's
+ * threads waiting in queue to get some storage while we resize, it's
* actually possible that total_used will be larger than old_size. In
* particular, block_pool_alloc() increments state->next prior to
* calling block_pool_grow, so this ensures that we get enough space for
@@ -642,11 +522,7 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
* We align to a page size because it makes it easier to do our
* calculations later in such a way that we state page-aigned.
*/
- uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE);
- uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE);
- uint32_t total_used = front_used + back_used;
-
- assert(state == &pool->state || back_used > 0);
+ uint32_t total_used = align(pool->state.next, PAGE_SIZE);
uint32_t old_size = pool->size;
@@ -655,97 +531,49 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state,
*/
assert(old_size > 0);
- const uint32_t old_back = pool->center_bo_offset;
- const uint32_t old_front = old_size - pool->center_bo_offset;
-
- /* The back_used and front_used may actually be smaller than the actual
- * requirement because they are based on the next pointers which are
- * updated prior to calling this function.
+ /* total_used may actually be smaller than the actual requirement because
+ * they are based on the next pointers which are updated prior to calling
+ * this function.
*/
- uint32_t back_required = MAX2(back_used, old_back);
- uint32_t front_required = MAX2(front_used, old_front);
-
- if (pool->use_softpin) {
- /* With softpin, the pool is made up of a bunch of buffers with separate
- * maps. Make sure we have enough contiguous space that we can get a
- * properly contiguous map for the next chunk.
- */
- assert(old_back == 0);
- front_required = MAX2(front_required, old_front + contiguous_size);
- }
-
- if (back_used * 2 <= back_required && front_used * 2 <= front_required) {
- /* If we're in this case then this isn't the firsta allocation and we
- * already have enough space on both sides to hold double what we
- * have allocated. There's nothing for us to do.
- */
- goto done;
- }
-
- uint32_t size = old_size * 2;
- while (size < back_required + front_required)
- size *= 2;
-
- assert(size > pool->size);
+ uint32_t required = MAX2(total_used, old_size);
- /* We compute a new center_bo_offset such that, when we double the size
- * of the pool, we maintain the ratio of how much is used by each side.
- * This way things should remain more-or-less balanced.
+ /* With softpin, the pool is made up of a bunch of buffers with separate
+ * maps. Make sure we have enough contiguous space that we can get a
+ * properly contiguous map for the next chunk.
*/
- uint32_t center_bo_offset;
- if (back_used == 0) {
- /* If we're in this case then we have never called alloc_back(). In
- * this case, we want keep the offset at 0 to make things as simple
- * as possible for users that don't care about back allocations.
- */
- center_bo_offset = 0;
- } else {
- /* Try to "center" the allocation based on how much is currently in
- * use on each side of the center line.
- */
- center_bo_offset = ((uint64_t)size * back_used) / total_used;
-
- /* Align down to a multiple of the page size */
- center_bo_offset &= ~(PAGE_SIZE - 1);
+ required = MAX2(required, old_size + contiguous_size);
- assert(center_bo_offset >= back_used);
+ if (required > pool->max_size) {
+ result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ } else if (total_used * 2 > required) {
+ uint32_t size = old_size * 2;
+ while (size < required)
+ size *= 2;
- /* Make sure we don't shrink the back end of the pool */
- if (center_bo_offset < back_required)
- center_bo_offset = back_required;
+ size = MIN2(size, pool->max_size);
+ assert(size > pool->size);
- /* Make sure that we don't shrink the front end of the pool */
- if (size - center_bo_offset < front_required)
- center_bo_offset = size - front_required;
+ result = anv_block_pool_expand_range(pool, size);
}
- assert(center_bo_offset % PAGE_SIZE == 0);
-
- result = anv_block_pool_expand_range(pool, center_bo_offset, size);
-
-done:
pthread_mutex_unlock(&pool->device->mutex);
- if (result == VK_SUCCESS) {
- /* Return the appropriate new size. This function never actually
- * updates state->next. Instead, we let the caller do that because it
- * needs to do so in order to maintain its concurrency model.
- */
- if (state == &pool->state) {
- return pool->size - pool->center_bo_offset;
- } else {
- assert(pool->center_bo_offset > 0);
- return pool->center_bo_offset;
- }
- } else {
+ if (result != VK_SUCCESS)
return 0;
- }
+
+ /* Return the appropriate new size. This function never actually
+ * updates state->next. Instead, we let the caller do that because it
+ * needs to do so in order to maintain its concurrency model.
+ */
+ return pool->size;
}
-static uint32_t
+static VkResult
anv_block_pool_alloc_new(struct anv_block_pool *pool,
struct anv_block_state *pool_state,
- uint32_t block_size, uint32_t *padding)
+ uint32_t block_size,
+ int64_t *offset,
+ uint32_t *padding)
{
struct anv_block_state state, old, new;
@@ -755,10 +583,13 @@ anv_block_pool_alloc_new(struct anv_block_pool *pool,
while (1) {
state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
- if (state.next + block_size <= state.end) {
- return state.next;
+ if (state.next + block_size > pool->max_size) {
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ } else if (state.next + block_size <= state.end) {
+ *offset = state.next;
+ return VK_SUCCESS;
} else if (state.next <= state.end) {
- if (pool->use_softpin && state.next < state.end) {
+ if (state.next < state.end) {
/* We need to grow the block pool, but still have some leftover
* space that can't be used by that particular allocation. So we
* add that as a "padding", and return it.
@@ -782,12 +613,17 @@ anv_block_pool_alloc_new(struct anv_block_pool *pool,
new.next = state.next + block_size;
do {
new.end = anv_block_pool_grow(pool, pool_state, block_size);
+ if (pool->size > 0 && new.end == 0) {
+ futex_wake(&pool_state->end, INT32_MAX);
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ }
} while (new.end < new.next);
old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
if (old.next != state.next)
- futex_wake(&pool_state->end, INT_MAX);
- return state.next;
+ futex_wake(&pool_state->end, INT32_MAX);
+ *offset = state.next;
+ return VK_SUCCESS;
} else {
futex_wait(&pool_state->end, state.end, NULL);
continue;
@@ -795,60 +631,31 @@ anv_block_pool_alloc_new(struct anv_block_pool *pool,
}
}
-int32_t
+VkResult
anv_block_pool_alloc(struct anv_block_pool *pool,
- uint32_t block_size, uint32_t *padding)
-{
- uint32_t offset;
-
- offset = anv_block_pool_alloc_new(pool, &pool->state, block_size, padding);
-
- return offset;
-}
-
-/* Allocates a block out of the back of the block pool.
- *
- * This will allocated a block earlier than the "start" of the block pool.
- * The offsets returned from this function will be negative but will still
- * be correct relative to the block pool's map pointer.
- *
- * If you ever use anv_block_pool_alloc_back, then you will have to do
- * gymnastics with the block pool's BO when doing relocations.
- */
-int32_t
-anv_block_pool_alloc_back(struct anv_block_pool *pool,
- uint32_t block_size)
+ uint32_t block_size,
+ int64_t *offset, uint32_t *padding)
{
- int32_t offset = anv_block_pool_alloc_new(pool, &pool->back_state,
- block_size, NULL);
-
- /* The offset we get out of anv_block_pool_alloc_new() is actually the
- * number of bytes downwards from the middle to the end of the block.
- * We need to turn it into a (negative) offset from the middle to the
- * start of the block.
- */
- assert(offset >= 0);
- return -(offset + block_size);
+ return anv_block_pool_alloc_new(pool, &pool->state, block_size, offset, padding);
}
VkResult
anv_state_pool_init(struct anv_state_pool *pool,
struct anv_device *device,
- const char *name,
- uint64_t base_address,
- int32_t start_offset,
- uint32_t block_size)
+ const struct anv_state_pool_params *params)
{
- /* We don't want to ever see signed overflow */
- assert(start_offset < INT32_MAX - (int32_t)BLOCK_POOL_MEMFD_SIZE);
-
- VkResult result = anv_block_pool_init(&pool->block_pool, device, name,
- base_address + start_offset,
- block_size * 16);
+ uint32_t initial_size = MAX2(params->block_size * 16,
+ device->info->mem_alignment);
+
+ VkResult result = anv_block_pool_init(&pool->block_pool, device,
+ params->name,
+ params->base_address + params->start_offset,
+ initial_size,
+ params->max_size);
if (result != VK_SUCCESS)
return result;
- pool->start_offset = start_offset;
+ pool->start_offset = params->start_offset;
result = anv_state_table_init(&pool->table, device, 64);
if (result != VK_SUCCESS) {
@@ -856,9 +663,8 @@ anv_state_pool_init(struct anv_state_pool *pool,
return result;
}
- assert(util_is_power_of_two_or_zero(block_size));
- pool->block_size = block_size;
- pool->back_alloc_free_list = ANV_FREE_LIST_EMPTY;
+ assert(util_is_power_of_two_or_zero(params->block_size));
+ pool->block_size = params->block_size;
for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
pool->buckets[i].free_list = ANV_FREE_LIST_EMPTY;
pool->buckets[i].block.next = 0;
@@ -877,15 +683,15 @@ anv_state_pool_finish(struct anv_state_pool *pool)
anv_block_pool_finish(&pool->block_pool);
}
-static uint32_t
+static VkResult
anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
struct anv_block_pool *block_pool,
uint32_t state_size,
uint32_t block_size,
+ int64_t *offset,
uint32_t *padding)
{
struct anv_block_state block, old, new;
- uint32_t offset;
/* We don't always use anv_block_pool_alloc(), which would set *padding to
* zero for us. So if we have a pointer to padding, we must zero it out
@@ -898,21 +704,25 @@ anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
* Instead, we just grab whole (potentially large) blocks.
*/
if (state_size >= block_size)
- return anv_block_pool_alloc(block_pool, state_size, padding);
+ return anv_block_pool_alloc(block_pool, state_size, offset, padding);
restart:
block.u64 = __sync_fetch_and_add(&pool->block.u64, state_size);
if (block.next < block.end) {
- return block.next;
+ *offset = block.next;
+ return VK_SUCCESS;
} else if (block.next == block.end) {
- offset = anv_block_pool_alloc(block_pool, block_size, padding);
- new.next = offset + state_size;
- new.end = offset + block_size;
+ VkResult result = anv_block_pool_alloc(block_pool, block_size,
+ offset, padding);
+ if (result != VK_SUCCESS)
+ return result;
+ new.next = *offset + state_size;
+ new.end = *offset + block_size;
old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
if (old.next != block.next)
- futex_wake(&pool->block.end, INT_MAX);
- return offset;
+ futex_wake(&pool->block.end, INT32_MAX);
+ return result;
} else {
futex_wait(&pool->block.end, block.end, NULL);
goto restart;
@@ -922,7 +732,7 @@ anv_fixed_size_state_pool_alloc_new(struct anv_fixed_size_state_pool *pool,
static uint32_t
anv_state_pool_get_bucket(uint32_t size)
{
- unsigned size_log2 = ilog2_round_up(size);
+ unsigned size_log2 = util_logbase2_ceil(size);
assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
size_log2 = ANV_MIN_STATE_SIZE_LOG2;
@@ -992,7 +802,7 @@ anv_state_pool_return_chunk(struct anv_state_pool *pool,
if (nblocks > 0) {
/* First return divisor aligned and sized chunks. We start returning
- * larger blocks from the end fo the chunk, since they should already be
+ * larger blocks from the end of the chunk, since they should already be
* aligned to divisor. Also anv_state_pool_return_blocks() only accepts
* aligned chunks.
*/
@@ -1031,7 +841,7 @@ anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
struct anv_state *state;
uint32_t alloc_size = anv_state_pool_get_bucket_size(bucket);
- int32_t offset;
+ int64_t offset;
/* Try free list first. */
state = anv_free_list_pop(&pool->buckets[bucket].free_list,
@@ -1091,14 +901,19 @@ anv_state_pool_alloc_no_vg(struct anv_state_pool *pool,
}
uint32_t padding;
- offset = anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
- &pool->block_pool,
- alloc_size,
- pool->block_size,
- &padding);
- /* Everytime we allocate a new state, add it to the state pool */
- uint32_t idx;
- UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
+ VkResult result =
+ anv_fixed_size_state_pool_alloc_new(&pool->buckets[bucket],
+ &pool->block_pool,
+ alloc_size,
+ pool->block_size,
+ &offset,
+ &padding);
+ if (result != VK_SUCCESS)
+ return ANV_STATE_NULL;
+
+ /* Every time we allocate a new state, add it to the state pool */
+ uint32_t idx = 0;
+ result = anv_state_table_add(&pool->table, &idx, 1);
assert(result == VK_SUCCESS);
state = anv_state_table_get(&pool->table, idx);
@@ -1126,52 +941,16 @@ anv_state_pool_alloc(struct anv_state_pool *pool, uint32_t size, uint32_t align)
return state;
}
-struct anv_state
-anv_state_pool_alloc_back(struct anv_state_pool *pool)
-{
- struct anv_state *state;
- uint32_t alloc_size = pool->block_size;
-
- /* This function is only used with pools where start_offset == 0 */
- assert(pool->start_offset == 0);
-
- state = anv_free_list_pop(&pool->back_alloc_free_list, &pool->table);
- if (state) {
- assert(state->offset < pool->start_offset);
- goto done;
- }
-
- int32_t offset;
- offset = anv_block_pool_alloc_back(&pool->block_pool,
- pool->block_size);
- uint32_t idx;
- UNUSED VkResult result = anv_state_table_add(&pool->table, &idx, 1);
- assert(result == VK_SUCCESS);
-
- state = anv_state_table_get(&pool->table, idx);
- state->offset = pool->start_offset + offset;
- state->alloc_size = alloc_size;
- state->map = anv_block_pool_map(&pool->block_pool, offset, alloc_size);
-
-done:
- VG(VALGRIND_MEMPOOL_ALLOC(pool, state->map, state->alloc_size));
- return *state;
-}
-
static void
anv_state_pool_free_no_vg(struct anv_state_pool *pool, struct anv_state state)
{
assert(util_is_power_of_two_or_zero(state.alloc_size));
unsigned bucket = anv_state_pool_get_bucket(state.alloc_size);
- if (state.offset < pool->start_offset) {
- assert(state.alloc_size == pool->block_size);
- anv_free_list_push(&pool->back_alloc_free_list,
- &pool->table, state.idx, 1);
- } else {
- anv_free_list_push(&pool->buckets[bucket].free_list,
- &pool->table, state.idx, 1);
- }
+ assert(state.offset >= pool->start_offset);
+
+ anv_free_list_push(&pool->buckets[bucket].free_list,
+ &pool->table, state.idx, 1);
}
void
@@ -1216,6 +995,7 @@ anv_state_stream_init(struct anv_state_stream *stream,
*/
stream->next = block_size;
+ stream->total_size = 0;
util_dynarray_init(&stream->all_blocks, NULL);
VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
@@ -1243,14 +1023,17 @@ anv_state_stream_alloc(struct anv_state_stream *stream,
assert(alignment <= PAGE_SIZE);
- uint32_t offset = align_u32(stream->next, alignment);
+ uint32_t offset = align(stream->next, alignment);
if (offset + size > stream->block.alloc_size) {
uint32_t block_size = stream->block_size;
if (block_size < size)
- block_size = round_to_power_of_two(size);
+ block_size = util_next_power_of_two(size);
stream->block = anv_state_pool_alloc_no_vg(stream->state_pool,
block_size, PAGE_SIZE);
+ if (stream->block.alloc_size == 0)
+ return ANV_STATE_NULL;
+
util_dynarray_append(&stream->all_blocks,
struct anv_state, stream->block);
VG(VALGRIND_MAKE_MEM_NOACCESS(stream->block.map, block_size));
@@ -1258,6 +1041,7 @@ anv_state_stream_alloc(struct anv_state_stream *stream,
/* Reset back to the start */
stream->next = offset = 0;
assert(offset + size <= stream->block.alloc_size);
+ stream->total_size += block_size;
}
const bool new_block = stream->next == 0;
@@ -1323,12 +1107,108 @@ anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
anv_free_list_push(&pool->reserved_blocks, &pool->pool->table, state.idx, 1);
}
+VkResult
+anv_state_reserved_array_pool_init(struct anv_state_reserved_array_pool *pool,
+ struct anv_state_pool *parent,
+ uint32_t count, uint32_t size, uint32_t alignment)
+{
+ pool->pool = parent;
+ pool->count = count;
+ pool->size = size;
+ pool->stride = align(size, alignment);
+ pool->states = vk_zalloc(&pool->pool->block_pool.device->vk.alloc,
+ sizeof(BITSET_WORD) * BITSET_WORDS(pool->count), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (pool->states == NULL)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+ BITSET_SET_RANGE(pool->states, 0, pool->count - 1);
+ simple_mtx_init(&pool->mutex, mtx_plain);
+
+ pool->state = anv_state_pool_alloc(pool->pool, pool->stride * count, alignment);
+
+ return VK_SUCCESS;
+}
+
+void
+anv_state_reserved_array_pool_finish(struct anv_state_reserved_array_pool *pool)
+{
+ anv_state_pool_free(pool->pool, pool->state);
+ vk_free(&pool->pool->block_pool.device->vk.alloc, pool->states);
+ simple_mtx_destroy(&pool->mutex);
+}
+
+struct anv_state
+anv_state_reserved_array_pool_alloc(struct anv_state_reserved_array_pool *pool,
+ bool alloc_back)
+{
+ simple_mtx_lock(&pool->mutex);
+ int idx = alloc_back ?
+ __bitset_last_bit(pool->states, BITSET_WORDS(pool->count)) :
+ __bitset_ffs(pool->states, BITSET_WORDS(pool->count));
+ if (idx != 0)
+ BITSET_CLEAR(pool->states, idx - 1);
+ simple_mtx_unlock(&pool->mutex);
+
+ if (idx == 0)
+ return ANV_STATE_NULL;
+
+ idx--;
+
+ struct anv_state state = pool->state;
+ state.offset += idx * pool->stride;
+ state.map += idx * pool->stride;
+ state.alloc_size = pool->size;
+
+ return state;
+}
+
+struct anv_state
+anv_state_reserved_array_pool_alloc_index(struct anv_state_reserved_array_pool *pool,
+ uint32_t idx)
+{
+ simple_mtx_lock(&pool->mutex);
+ bool already_allocated = !BITSET_TEST(pool->states, idx);
+ if (!already_allocated)
+ BITSET_CLEAR(pool->states, idx);
+ simple_mtx_unlock(&pool->mutex);
+
+ if (already_allocated)
+ return ANV_STATE_NULL;
+
+ struct anv_state state = pool->state;
+ state.offset += idx * pool->stride;
+ state.map += idx * pool->stride;
+ state.alloc_size = pool->size;
+
+ return state;
+}
+
+uint32_t
+anv_state_reserved_array_pool_state_index(struct anv_state_reserved_array_pool *pool,
+ struct anv_state state)
+{
+ return (state.offset - pool->state.offset) / pool->stride;
+}
+
+void
+anv_state_reserved_array_pool_free(struct anv_state_reserved_array_pool *pool,
+ struct anv_state state)
+{
+ unsigned idx = (state.offset - pool->state.offset) / pool->stride;
+ simple_mtx_lock(&pool->mutex);
+ BITSET_SET(pool->states, idx);
+ simple_mtx_unlock(&pool->mutex);
+ }
+
void
anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
- const char *name)
+ const char *name, enum anv_bo_alloc_flags alloc_flags)
{
pool->name = name;
pool->device = device;
+ pool->bo_alloc_flags = alloc_flags;
+
for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
util_sparse_array_free_list_init(&pool->free_list[i],
&device->bo_cache.bo_map, 0,
@@ -1361,7 +1241,7 @@ VkResult
anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
struct anv_bo **bo_out)
{
- const unsigned size_log2 = size < 4096 ? 12 : ilog2_round_up(size);
+ const unsigned size_log2 = size < 4096 ? 12 : util_logbase2_ceil(size);
const unsigned pow2_size = 1 << size_log2;
const unsigned bucket = size_log2 - 12;
assert(bucket < ARRAY_SIZE(pool->free_list));
@@ -1377,10 +1257,7 @@ anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
VkResult result = anv_device_alloc_bo(pool->device,
pool->name,
pow2_size,
- ANV_BO_ALLOC_LOCAL_MEM |
- ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED |
- ANV_BO_ALLOC_CAPTURE,
+ pool->bo_alloc_flags,
0 /* explicit_address */,
&bo);
if (result != VK_SUCCESS)
@@ -1401,7 +1278,7 @@ anv_bo_pool_free(struct anv_bo_pool *pool, struct anv_bo *bo)
VG(VALGRIND_MEMPOOL_FREE(pool, bo->map));
assert(util_is_power_of_two_or_zero(bo->size));
- const unsigned size_log2 = ilog2_round_up(bo->size);
+ const unsigned size_log2 = util_logbase2_ceil(bo->size);
const unsigned bucket = size_log2 - 12;
assert(bucket < ARRAY_SIZE(pool->free_list));
@@ -1431,7 +1308,7 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool
for (unsigned i = 0; i < 16; i++) {
if (pool->surf_states[i].map != NULL) {
- anv_state_pool_free(&device->surface_state_pool,
+ anv_state_pool_free(&device->scratch_surface_state_pool,
pool->surf_states[i]);
}
}
@@ -1449,7 +1326,7 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
assert(stage < ARRAY_SIZE(pool->bos));
- const struct intel_device_info *devinfo = &device->info;
+ const struct intel_device_info *devinfo = device->info;
/* On GFX version 12.5, scratch access changed to a surface-based model.
* Instead of each shader type having its own layout based on IDs passed
@@ -1484,9 +1361,11 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
*
* so nothing will ever touch the top page.
*/
+ const enum anv_bo_alloc_flags alloc_flags =
+ ANV_BO_ALLOC_INTERNAL |
+ (devinfo->verx10 < 125 ? ANV_BO_ALLOC_32BIT_ADDRESS : 0);
VkResult result = anv_device_alloc_bo(device, "scratch", size,
- ANV_BO_ALLOC_32BIT_ADDRESS |
- ANV_BO_ALLOC_LOCAL_MEM,
+ alloc_flags,
0 /* explicit_address */,
&bo);
if (result != VK_SUCCESS)
@@ -1507,6 +1386,8 @@ anv_scratch_pool_get_surf(struct anv_device *device,
struct anv_scratch_pool *pool,
unsigned per_thread_scratch)
{
+ assert(device->info->verx10 >= 125);
+
if (per_thread_scratch == 0)
return 0;
@@ -1523,7 +1404,7 @@ anv_scratch_pool_get_surf(struct anv_device *device,
struct anv_address addr = { .bo = bo };
struct anv_state state =
- anv_state_pool_alloc(&device->surface_state_pool,
+ anv_state_pool_alloc(&device->scratch_surface_state_pool,
device->isl_dev.ss.size, 64);
isl_buffer_fill_state(&device->isl_dev, state.map,
@@ -1538,7 +1419,7 @@ anv_scratch_pool_get_surf(struct anv_device *device,
uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
0, state.offset);
if (current) {
- anv_state_pool_free(&device->surface_state_pool, state);
+ anv_state_pool_free(&device->scratch_surface_state_pool, state);
return current;
} else {
pool->surf_states[scratch_size_log2] = state;
@@ -1547,13 +1428,13 @@ anv_scratch_pool_get_surf(struct anv_device *device,
}
VkResult
-anv_bo_cache_init(struct anv_bo_cache *cache)
+anv_bo_cache_init(struct anv_bo_cache *cache, struct anv_device *device)
{
util_sparse_array_init(&cache->bo_map, sizeof(struct anv_bo), 1024);
if (pthread_mutex_init(&cache->mutex, NULL)) {
util_sparse_array_finish(&cache->bo_map);
- return vk_errorf(NULL, NULL, VK_ERROR_OUT_OF_HOST_MEMORY,
+ return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
"pthread_mutex_init failed: %m");
}
@@ -1567,50 +1448,113 @@ anv_bo_cache_finish(struct anv_bo_cache *cache)
pthread_mutex_destroy(&cache->mutex);
}
-#define ANV_BO_CACHE_SUPPORTED_FLAGS \
- (EXEC_OBJECT_WRITE | \
- EXEC_OBJECT_ASYNC | \
- EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \
- EXEC_OBJECT_PINNED | \
- EXEC_OBJECT_CAPTURE)
+static void
+anv_bo_unmap_close(struct anv_device *device, struct anv_bo *bo)
+{
+ if (bo->map && !bo->from_host_ptr)
+ anv_device_unmap_bo(device, bo, bo->map, bo->size, false /* replace */);
-static uint32_t
-anv_bo_alloc_flags_to_bo_flags(struct anv_device *device,
- enum anv_bo_alloc_flags alloc_flags)
+ assert(bo->gem_handle != 0);
+ device->kmd_backend->gem_close(device, bo);
+}
+
+static void
+anv_bo_vma_free(struct anv_device *device, struct anv_bo *bo)
{
- struct anv_physical_device *pdevice = device->physical;
+ if (bo->offset != 0 && !(bo->alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS)) {
+ assert(bo->vma_heap != NULL);
+ anv_vma_free(device, bo->vma_heap, bo->offset, bo->size);
+ }
+ bo->vma_heap = NULL;
+}
- uint64_t bo_flags = 0;
- if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS) &&
- pdevice->supports_48bit_addresses)
- bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+static void
+anv_bo_finish(struct anv_device *device, struct anv_bo *bo)
+{
+ /* Not releasing vma in case unbind fails */
+ if (device->kmd_backend->vm_unbind_bo(device, bo) == VK_SUCCESS)
+ anv_bo_vma_free(device, bo);
- if ((alloc_flags & ANV_BO_ALLOC_CAPTURE) && pdevice->has_exec_capture)
- bo_flags |= EXEC_OBJECT_CAPTURE;
+ anv_bo_unmap_close(device, bo);
+}
- if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) {
- assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC);
- bo_flags |= EXEC_OBJECT_WRITE;
- }
+static VkResult
+anv_bo_vma_alloc_or_close(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t explicit_address)
+{
+ assert(bo->vma_heap == NULL);
+ assert(explicit_address == intel_48b_address(explicit_address));
- if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async)
- bo_flags |= EXEC_OBJECT_ASYNC;
+ uint32_t align = device->physical->info.mem_alignment;
- if (pdevice->use_softpin)
- bo_flags |= EXEC_OBJECT_PINNED;
+ /* If it's big enough to store a tiled resource, we need 64K alignment */
+ if (bo->size >= 64 * 1024)
+ align = MAX2(64 * 1024, align);
- return bo_flags;
+ /* If we're using the AUX map, make sure we follow the required
+ * alignment.
+ */
+ if (alloc_flags & ANV_BO_ALLOC_AUX_TT_ALIGNED)
+ align = MAX2(intel_aux_map_get_alignment(device->aux_map_ctx), align);
+
+ /* Opportunistically align addresses to 2Mb when above 1Mb. We do this
+ * because this gives an opportunity for the kernel to use Transparent Huge
+ * Pages (the 2MB page table layout) for faster memory access.
+ *
+ * Only available on ICL+.
+ */
+ if (device->info->ver >= 11 && bo->size >= 1 * 1024 * 1024)
+ align = MAX2(2 * 1024 * 1024, align);
+
+ if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
+ bo->offset = intel_canonical_address(explicit_address);
+ } else {
+ bo->offset = anv_vma_alloc(device, bo->size, align, alloc_flags,
+ explicit_address, &bo->vma_heap);
+ if (bo->offset == 0) {
+ anv_bo_unmap_close(device, bo);
+ return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+ "failed to allocate virtual address for BO");
+ }
+ }
+
+ return VK_SUCCESS;
}
-static uint32_t
-anv_device_get_bo_align(struct anv_device *device,
- enum anv_bo_alloc_flags alloc_flags)
+enum intel_device_info_mmap_mode
+anv_bo_get_mmap_mode(struct anv_device *device, struct anv_bo *bo)
{
- /* Gfx12 CCS surface addresses need to be 64K aligned. */
- if (device->info.ver >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS))
- return 64 * 1024;
+ enum anv_bo_alloc_flags alloc_flags = bo->alloc_flags;
+
+ if (device->info->has_set_pat_uapi)
+ return anv_device_get_pat_entry(device, alloc_flags)->mmap;
+
+ if (anv_physical_device_has_vram(device->physical)) {
+ if ((alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) ||
+ (alloc_flags & ANV_BO_ALLOC_IMPORTED))
+ return INTEL_DEVICE_INFO_MMAP_MODE_WB;
+
+ return INTEL_DEVICE_INFO_MMAP_MODE_WC;
+ }
+
+ /* Gfx9 Atom parts (no LLC) */
+ if (!device->info->has_llc) {
+ /* The user wants cached and coherent memory, but achieving that without
+ * LLC on older platforms requires DRM_IOCTL_I915_GEM_SET_CACHING to be
+ * supported and set.
+ */
+ if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
+ return INTEL_DEVICE_INFO_MMAP_MODE_WB;
+
+ return INTEL_DEVICE_INFO_MMAP_MODE_WC;
+ }
+
+ if (alloc_flags & (ANV_BO_ALLOC_SCANOUT | ANV_BO_ALLOC_EXTERNAL))
+ return INTEL_DEVICE_INFO_MMAP_MODE_WC;
- return 4096;
+ return INTEL_DEVICE_INFO_MMAP_MODE_WB;
}
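
One way the mode returned above is consumed downstream is the i915 mmap-offset ioctl, where WB and WC select a write-back (CPU cached) versus write-combined mapping. The sketch below is an assumption about how an i915-style backend might translate the mode; anv's real gem_mmap implementations live in the kmd backends and also cover the Xe uAPI.

#include <stdint.h>
#include <drm/i915_drm.h>

/* Illustrative only: pick an i915 mmap-offset flag from the mmap mode
 * chosen above. The enum comes from the intel_device_info headers already
 * included via anv_private.h in this file.
 */
static uint32_t
mmap_mode_to_i915_offset_flag(enum intel_device_info_mmap_mode mode)
{
   return mode == INTEL_DEVICE_INFO_MMAP_MODE_WB ? I915_MMAP_OFFSET_WB
                                                 : I915_MMAP_OFFSET_WC;
}
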
VkResult
@@ -1621,57 +1565,70 @@ anv_device_alloc_bo(struct anv_device *device,
uint64_t explicit_address,
struct anv_bo **bo_out)
{
- if (!(alloc_flags & ANV_BO_ALLOC_LOCAL_MEM))
- anv_perf_warn(device, NULL, "system memory used");
+ /* A BO that needs CPU access must be HOST_CACHED, HOST_COHERENT, or both. */
+ assert((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0 ||
+ (alloc_flags & (ANV_BO_ALLOC_HOST_CACHED | ANV_BO_ALLOC_HOST_COHERENT)));
- if (!device->physical->has_implicit_ccs)
- assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
-
- const uint32_t bo_flags =
- anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
- assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
+ /* The KMD requires a valid PAT index, so set HOST_COHERENT (WC) on BOs
+ * that don't need CPU access.
+ */
+ if ((alloc_flags & ANV_BO_ALLOC_MAPPED) == 0)
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
- /* The kernel is going to give us whole pages anyway */
- size = align_u64(size, 4096);
+ /* On platforms with LLC we can promote all BOs to cached+coherent for free. */
+ const enum anv_bo_alloc_flags not_allowed_promotion = ANV_BO_ALLOC_SCANOUT |
+ ANV_BO_ALLOC_EXTERNAL |
+ ANV_BO_ALLOC_PROTECTED;
+ if (device->info->has_llc && ((alloc_flags & not_allowed_promotion) == 0))
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
- const uint32_t align = anv_device_get_bo_align(device, alloc_flags);
+ const uint32_t bo_flags =
+ device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
- uint64_t ccs_size = 0;
- if (device->info.has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) {
- /* Align the size up to the next multiple of 64K so we don't have any
- * AUX-TT entries pointing from a 64K page to itself.
- */
- size = align_u64(size, 64 * 1024);
+ /* The kernel is going to give us whole pages anyway. */
+ size = align64(size, 4096);
- /* See anv_bo::_ccs_size */
- ccs_size = align_u64(DIV_ROUND_UP(size, INTEL_AUX_MAP_GFX12_CCS_SCALE), 4096);
+ const uint64_t ccs_offset = size;
+ if (alloc_flags & ANV_BO_ALLOC_AUX_CCS) {
+ assert(device->info->has_aux_map);
+ size += DIV_ROUND_UP(size, intel_aux_get_main_to_aux_ratio(device->aux_map_ctx));
+ size = align64(size, 4096);
}
- uint32_t gem_handle;
+ const struct intel_memory_class_instance *regions[2];
+ uint32_t nregions = 0;
/* If we have vram size, we have multiple memory regions and should choose
* one of them.
*/
- if (device->physical->vram.size > 0) {
- struct drm_i915_gem_memory_class_instance regions[2];
- uint32_t nregions = 0;
-
- if (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM) {
- /* For vram allocation, still use system memory as a fallback. */
- regions[nregions++] = device->physical->vram.region;
- regions[nregions++] = device->physical->sys.region;
- } else {
+ if (anv_physical_device_has_vram(device->physical)) {
+ /* This always tries to put the object in local memory. Here
+ * vram_non_mappable and vram_mappable are actually the same region.
+ */
+ if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
regions[nregions++] = device->physical->sys.region;
- }
+ else
+ regions[nregions++] = device->physical->vram_non_mappable.region;
- gem_handle = anv_gem_create_regions(device, size + ccs_size,
- nregions, regions);
+ /* If the buffer is mapped on the host, add the system memory region.
+ * This ensures that if the buffer cannot live in mappable local memory,
+ * it can be spilled to system memory.
+ */
+ if (!(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
+ ((alloc_flags & ANV_BO_ALLOC_MAPPED) ||
+ (alloc_flags & ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)))
+ regions[nregions++] = device->physical->sys.region;
} else {
- gem_handle = anv_gem_create(device, size + ccs_size);
+ regions[nregions++] = device->physical->sys.region;
}
+ uint64_t actual_size;
+ uint32_t gem_handle = device->kmd_backend->gem_create(device, regions,
+ nregions, size,
+ alloc_flags,
+ &actual_size);
if (gem_handle == 0)
- return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
struct anv_bo new_bo = {
.name = name,
@@ -1679,67 +1636,32 @@ anv_device_alloc_bo(struct anv_device *device,
.refcount = 1,
.offset = -1,
.size = size,
- ._ccs_size = ccs_size,
+ .ccs_offset = ccs_offset,
+ .actual_size = actual_size,
.flags = bo_flags,
- .is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL),
- .has_client_visible_address =
- (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
- .has_implicit_ccs = ccs_size > 0,
+ .alloc_flags = alloc_flags,
};
if (alloc_flags & ANV_BO_ALLOC_MAPPED) {
- new_bo.map = anv_gem_mmap(device, new_bo.gem_handle, 0, size, 0);
- if (new_bo.map == MAP_FAILED) {
- anv_gem_close(device, new_bo.gem_handle);
- return vk_errorf(device, &device->vk.base,
- VK_ERROR_OUT_OF_HOST_MEMORY,
- "mmap failed: %m");
- }
- }
-
- if (alloc_flags & ANV_BO_ALLOC_SNOOPED) {
- assert(alloc_flags & ANV_BO_ALLOC_MAPPED);
- /* We don't want to change these defaults if it's going to be shared
- * with another process.
- */
- assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL));
-
- /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
- * I915_CACHING_NONE on non-LLC platforms. For many internal state
- * objects, we'd rather take the snooping overhead than risk forgetting
- * a CLFLUSH somewhere. Userptr objects are always created as
- * I915_CACHING_CACHED, which on non-LLC means snooped so there's no
- * need to do this there.
- */
- if (!device->info.has_llc) {
- anv_gem_set_caching(device, new_bo.gem_handle,
- I915_CACHING_CACHED);
+ VkResult result = anv_device_map_bo(device, &new_bo, 0, size,
+ NULL, &new_bo.map);
+ if (unlikely(result != VK_SUCCESS)) {
+ device->kmd_backend->gem_close(device, &new_bo);
+ return result;
}
}
- if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
- new_bo.has_fixed_address = true;
- new_bo.offset = explicit_address;
- } else if (new_bo.flags & EXEC_OBJECT_PINNED) {
- new_bo.offset = anv_vma_alloc(device, new_bo.size + new_bo._ccs_size,
- align, alloc_flags, explicit_address);
- if (new_bo.offset == 0) {
- if (new_bo.map)
- anv_gem_munmap(device, new_bo.map, size);
- anv_gem_close(device, new_bo.gem_handle);
- return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "failed to allocate virtual address for BO");
- }
- } else {
- assert(!new_bo.has_client_visible_address);
- }
+ VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
+ alloc_flags,
+ explicit_address);
+ if (result != VK_SUCCESS)
+ return result;
- if (new_bo._ccs_size > 0) {
- assert(device->info.has_aux_map);
- intel_aux_map_add_mapping(device->aux_map_ctx,
- intel_canonical_address(new_bo.offset),
- intel_canonical_address(new_bo.offset + new_bo.size),
- new_bo.size, 0 /* format_bits */);
+ result = device->kmd_backend->vm_bind_bo(device, &new_bo);
+ if (result != VK_SUCCESS) {
+ anv_bo_vma_free(device, &new_bo);
+ anv_bo_unmap_close(device, &new_bo);
+ return result;
}
assert(new_bo.gem_handle);
@@ -1752,6 +1674,56 @@ anv_device_alloc_bo(struct anv_device *device,
*bo_out = bo;
+ ANV_RMV(bo_allocate, device, bo);
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_device_map_bo(struct anv_device *device,
+ struct anv_bo *bo,
+ uint64_t offset,
+ size_t size,
+ void *placed_addr,
+ void **map_out)
+{
+ assert(!bo->from_host_ptr);
+ assert(size > 0);
+
+ void *map = device->kmd_backend->gem_mmap(device, bo, offset, size, placed_addr);
+ if (unlikely(map == MAP_FAILED))
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED, "mmap failed: %m");
+
+ assert(placed_addr == NULL || map == placed_addr);
+
+ assert(map != NULL);
+ VG(VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1));
+
+ if (map_out)
+ *map_out = map;
+
+ return VK_SUCCESS;
+}
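
A minimal usage sketch pairing the two helpers in this hunk (anv_device_map_bo above and anv_device_unmap_bo below): map the whole BO, write it from the CPU, then unmap without the replace path. touch_bo_on_cpu is a hypothetical caller, and the BO is assumed to have been allocated with CPU-mappable flags.

#include <string.h>

/* Hypothetical caller of the helpers in this hunk. Error handling is
 * reduced to passing the VkResult through.
 */
static VkResult
touch_bo_on_cpu(struct anv_device *device, struct anv_bo *bo)
{
   void *map = NULL;
   VkResult result = anv_device_map_bo(device, bo, 0, bo->size, NULL, &map);
   if (result != VK_SUCCESS)
      return result;

   memset(map, 0, bo->size);   /* CPU writes go through the BO's mmap mode */

   return anv_device_unmap_bo(device, bo, map, bo->size, false /* replace */);
}
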
+
+VkResult
+anv_device_unmap_bo(struct anv_device *device,
+ struct anv_bo *bo,
+ void *map, size_t map_size,
+ bool replace)
+{
+ assert(!bo->from_host_ptr);
+
+ if (replace) {
+ map = mmap(map, map_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ if (map == MAP_FAILED) {
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
+ "Failed to map over original mapping");
+ }
+ } else {
+ VG(VALGRIND_FREELIKE_BLOCK(map, 0));
+ munmap(map, map_size);
+ }
return VK_SUCCESS;
}
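
The replace path above keeps the BO's virtual address range reserved by mapping anonymous PROT_NONE pages over it instead of unmapping it. The standalone POSIX sketch below shows that reservation trick in isolation; reserve_range is a made-up name and this is not anv code.

#include <stddef.h>
#include <sys/mman.h>

/* Replace an existing mapping with inaccessible anonymous pages.
 * MAP_FIXED makes mmap() swap the pages out atomically at the same
 * address, so no concurrent allocation can land in the hole the way it
 * could after a plain munmap().
 */
static int
reserve_range(void *addr, size_t size)
{
   void *p = mmap(addr, size, PROT_NONE,
                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
   return p == MAP_FAILED ? -1 : 0;
}
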
@@ -1763,25 +1735,35 @@ anv_device_import_bo_from_host_ptr(struct anv_device *device,
struct anv_bo **bo_out)
{
assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED |
+ ANV_BO_ALLOC_HOST_CACHED |
+ ANV_BO_ALLOC_HOST_COHERENT |
+ ANV_BO_ALLOC_AUX_CCS |
+ ANV_BO_ALLOC_PROTECTED |
ANV_BO_ALLOC_FIXED_ADDRESS)));
-
- /* We can't do implicit CCS with an aux table on shared memory */
- if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
- assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+ assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
struct anv_bo_cache *cache = &device->bo_cache;
const uint32_t bo_flags =
- anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
- assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
+ device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
- uint32_t gem_handle = anv_gem_userptr(device, host_ptr, size);
+ uint32_t gem_handle = device->kmd_backend->gem_create_userptr(device, host_ptr, size);
if (!gem_handle)
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
pthread_mutex_lock(&cache->mutex);
- struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
+ struct anv_bo *bo = NULL;
+ if (device->info->kmd_type == INTEL_KMD_TYPE_XE) {
+ bo = vk_zalloc(&device->vk.alloc, sizeof(*bo), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!bo) {
+ pthread_mutex_unlock(&cache->mutex);
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+ }
+ } else {
+ bo = anv_device_lookup_bo(device, gem_handle);
+ }
+
if (bo->refcount > 0) {
/* VK_EXT_external_memory_host doesn't require handling importing the
* same pointer twice at the same time, but we don't get in the way. If
@@ -1790,59 +1772,59 @@ anv_device_import_bo_from_host_ptr(struct anv_device *device,
assert(bo->gem_handle == gem_handle);
if (bo_flags != bo->flags) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"same host pointer imported two different ways");
}
- if (bo->has_client_visible_address !=
- ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) {
+ if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
+ (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"The same BO was imported with and without buffer "
"device address");
}
if (client_address && client_address != intel_48b_address(bo->offset)) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"The same BO was imported at two different "
"addresses");
}
__sync_fetch_and_add(&bo->refcount, 1);
} else {
+ alloc_flags |= ANV_BO_ALLOC_IMPORTED;
struct anv_bo new_bo = {
.name = "host-ptr",
.gem_handle = gem_handle,
.refcount = 1,
.offset = -1,
.size = size,
+ .actual_size = size,
.map = host_ptr,
.flags = bo_flags,
- .is_external = true,
+ .alloc_flags = alloc_flags,
.from_host_ptr = true,
- .has_client_visible_address =
- (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
};
- assert(client_address == intel_48b_address(client_address));
- if (new_bo.flags & EXEC_OBJECT_PINNED) {
- assert(new_bo._ccs_size == 0);
- new_bo.offset = anv_vma_alloc(device, new_bo.size,
- anv_device_get_bo_align(device,
- alloc_flags),
- alloc_flags, client_address);
- if (new_bo.offset == 0) {
- anv_gem_close(device, new_bo.gem_handle);
- pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "failed to allocate virtual address for BO");
- }
- } else {
- assert(!new_bo.has_client_visible_address);
+ VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
+ alloc_flags,
+ client_address);
+ if (result != VK_SUCCESS) {
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
+ }
+
+ result = device->kmd_backend->vm_bind_bo(device, &new_bo);
+ if (result != VK_SUCCESS) {
+ anv_bo_vma_free(device, &new_bo);
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
}
*bo = new_bo;
+
+ ANV_RMV(bo_allocate, device, bo);
}
pthread_mutex_unlock(&cache->mutex);
@@ -1859,125 +1841,90 @@ anv_device_import_bo(struct anv_device *device,
struct anv_bo **bo_out)
{
assert(!(alloc_flags & (ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED |
+ ANV_BO_ALLOC_HOST_CACHED |
+ ANV_BO_ALLOC_HOST_COHERENT |
ANV_BO_ALLOC_FIXED_ADDRESS)));
-
- /* We can't do implicit CCS with an aux table on shared memory */
- if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
- assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+ assert(alloc_flags & ANV_BO_ALLOC_EXTERNAL);
struct anv_bo_cache *cache = &device->bo_cache;
- const uint32_t bo_flags =
- anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
- assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
pthread_mutex_lock(&cache->mutex);
uint32_t gem_handle = anv_gem_fd_to_handle(device, fd);
if (!gem_handle) {
pthread_mutex_unlock(&cache->mutex);
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
}
struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
- if (bo->refcount > 0) {
- /* We have to be careful how we combine flags so that it makes sense.
- * Really, though, if we get to this case and it actually matters, the
- * client has imported a BO twice in different ways and they get what
- * they have coming.
- */
- uint64_t new_flags = 0;
- new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE;
- new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC;
- new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
- new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED;
- new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_CAPTURE;
-
- /* It's theoretically possible for a BO to get imported such that it's
- * both pinned and not pinned. The only way this can happen is if it
- * gets imported as both a semaphore and a memory object and that would
- * be an application error. Just fail out in that case.
- */
- if ((bo->flags & EXEC_OBJECT_PINNED) !=
- (bo_flags & EXEC_OBJECT_PINNED)) {
- pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "The same BO was imported two different ways");
- }
- /* It's also theoretically possible that someone could export a BO from
- * one heap and import it into another or to import the same BO into two
- * different heaps. If this happens, we could potentially end up both
- * allowing and disallowing 48-bit addresses. There's not much we can
- * do about it if we're pinning so we just throw an error and hope no
- * app is actually that stupid.
- */
- if ((new_flags & EXEC_OBJECT_PINNED) &&
- (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) !=
- (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) {
- pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "The same BO was imported on two different heaps");
- }
+ uint32_t bo_flags;
+ VkResult result = anv_gem_import_bo_alloc_flags_to_bo_flags(device, bo,
+ alloc_flags,
+ &bo_flags);
+ if (result != VK_SUCCESS) {
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
+ }
- if (bo->has_client_visible_address !=
- ((alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0)) {
+ if (bo->refcount > 0) {
+ if ((bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) !=
+ (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS)) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"The same BO was imported with and without buffer "
"device address");
}
if (client_address && client_address != intel_48b_address(bo->offset)) {
pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"The same BO was imported at two different "
"addresses");
}
- bo->flags = new_flags;
-
__sync_fetch_and_add(&bo->refcount, 1);
} else {
- off_t size = lseek(fd, 0, SEEK_END);
- if (size == (off_t)-1) {
- anv_gem_close(device, gem_handle);
- pthread_mutex_unlock(&cache->mutex);
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
+ alloc_flags |= ANV_BO_ALLOC_IMPORTED;
struct anv_bo new_bo = {
.name = "imported",
.gem_handle = gem_handle,
.refcount = 1,
.offset = -1,
- .size = size,
- .flags = bo_flags,
- .is_external = true,
- .has_client_visible_address =
- (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
+ .alloc_flags = alloc_flags,
};
- assert(client_address == intel_48b_address(client_address));
- if (new_bo.flags & EXEC_OBJECT_PINNED) {
- assert(new_bo._ccs_size == 0);
- new_bo.offset = anv_vma_alloc(device, new_bo.size,
- anv_device_get_bo_align(device,
- alloc_flags),
- alloc_flags, client_address);
- if (new_bo.offset == 0) {
- anv_gem_close(device, new_bo.gem_handle);
- pthread_mutex_unlock(&cache->mutex);
- return vk_errorf(device, NULL, VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "failed to allocate virtual address for BO");
- }
- } else {
- assert(!new_bo.has_client_visible_address);
+ off_t size = lseek(fd, 0, SEEK_END);
+ if (size == (off_t)-1) {
+ device->kmd_backend->gem_close(device, &new_bo);
+ pthread_mutex_unlock(&cache->mutex);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ }
+ new_bo.size = size;
+ new_bo.actual_size = size;
+
+ VkResult result = anv_bo_vma_alloc_or_close(device, &new_bo,
+ alloc_flags,
+ client_address);
+ if (result != VK_SUCCESS) {
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
+ }
+
+ result = device->kmd_backend->vm_bind_bo(device, &new_bo);
+ if (result != VK_SUCCESS) {
+ anv_bo_vma_free(device, &new_bo);
+ pthread_mutex_unlock(&cache->mutex);
+ return result;
}
*bo = new_bo;
+
+ ANV_RMV(bo_allocate, device, bo);
}
+ bo->flags = bo_flags;
+
pthread_mutex_unlock(&cache->mutex);
*bo_out = bo;
@@ -1994,17 +1941,49 @@ anv_device_export_bo(struct anv_device *device,
* to export it. This is done based on external options passed into
* anv_AllocateMemory.
*/
- assert(bo->is_external);
+ assert(anv_bo_is_external(bo));
int fd = anv_gem_handle_to_fd(device, bo->gem_handle);
if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
+ return vk_error(device, VK_ERROR_TOO_MANY_OBJECTS);
*fd_out = fd;
return VK_SUCCESS;
}
+VkResult
+anv_device_get_bo_tiling(struct anv_device *device,
+ struct anv_bo *bo,
+ enum isl_tiling *tiling_out)
+{
+ int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
+ if (i915_tiling < 0) {
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ "failed to get BO tiling: %m");
+ }
+
+ *tiling_out = isl_tiling_from_i915_tiling(i915_tiling);
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_device_set_bo_tiling(struct anv_device *device,
+ struct anv_bo *bo,
+ uint32_t row_pitch_B,
+ enum isl_tiling tiling)
+{
+ int ret = anv_gem_set_tiling(device, bo->gem_handle, row_pitch_B,
+ isl_tiling_to_i915_tiling(tiling));
+ if (ret) {
+ return vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
+ "failed to set BO tiling: %m");
+ }
+
+ return VK_SUCCESS;
+}
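
As a usage note, the gralloc import path later in this series turns the queried tiling into the single-bit isl_tiling_flags mask that image creation expects. A trimmed sketch, with query_imported_bo_tiling_flags as a hypothetical wrapper:

/* Hypothetical wrapper around anv_device_get_bo_tiling() above, mirroring
 * the "1u << tiling" pattern used by anv_image_init_from_gralloc().
 */
static VkResult
query_imported_bo_tiling_flags(struct anv_device *device, struct anv_bo *bo,
                               isl_tiling_flags_t *flags_out)
{
   enum isl_tiling tiling;
   VkResult result = anv_device_get_bo_tiling(device, bo, &tiling);
   if (result != VK_SUCCESS)
      return result;

   *flags_out = 1u << tiling;
   return VK_SUCCESS;
}
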
+
static bool
atomic_dec_not_one(uint32_t *counter)
{
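
The body of atomic_dec_not_one() is elided by the hunk below; a decrement-unless-one helper is typically a compare-and-swap loop along the lines of the sketch that follows (an illustration, not necessarily the exact anv implementation).

#include <stdbool.h>
#include <stdint.h>

/* Atomically decrement *counter unless it is already 1. Returns true if
 * the decrement happened (other references remain), false if the caller
 * saw the last reference and must perform the teardown under the lock.
 */
static bool
dec_not_one_sketch(uint32_t *counter)
{
   uint32_t old = *counter;
   while (old != 1) {
      uint32_t seen = __sync_val_compare_and_swap(counter, old, old - 1);
      if (seen == old)
         return true;
      old = seen;
   }
   return false;
}
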
@@ -2028,7 +2007,10 @@ anv_device_release_bo(struct anv_device *device,
struct anv_bo *bo)
{
struct anv_bo_cache *cache = &device->bo_cache;
- assert(anv_device_lookup_bo(device, bo->gem_handle) == bo);
+ const bool bo_is_xe_userptr = device->info->kmd_type == INTEL_KMD_TYPE_XE &&
+ bo->from_host_ptr;
+ assert(bo_is_xe_userptr ||
+ anv_device_lookup_bo(device, bo->gem_handle) == bo);
/* Try to decrement the counter but don't go below one. If this succeeds
* then the refcount has been decremented and we are not the last
@@ -2037,6 +2019,8 @@ anv_device_release_bo(struct anv_device *device,
if (atomic_dec_not_one(&bo->refcount))
return;
+ ANV_RMV(bo_destroy, device, bo);
+
pthread_mutex_lock(&cache->mutex);
/* We are probably the last reference since our attempt to decrement above
@@ -2051,33 +2035,21 @@ anv_device_release_bo(struct anv_device *device,
}
assert(bo->refcount == 0);
- if (bo->map && !bo->from_host_ptr)
- anv_gem_munmap(device, bo->map, bo->size);
-
- if (bo->_ccs_size > 0) {
- assert(device->physical->has_implicit_ccs);
- assert(device->info.has_aux_map);
- assert(bo->has_implicit_ccs);
- intel_aux_map_unmap_range(device->aux_map_ctx,
- intel_canonical_address(bo->offset),
- bo->size);
- }
-
- if ((bo->flags & EXEC_OBJECT_PINNED) && !bo->has_fixed_address)
- anv_vma_free(device, bo->offset, bo->size + bo->_ccs_size);
-
- uint32_t gem_handle = bo->gem_handle;
-
/* Memset the BO just in case. The refcount being zero should be enough to
* prevent someone from assuming the data is valid but it's safer to just
- * stomp to zero just in case. We explicitly do this *before* we close the
- * GEM handle to ensure that if anyone allocates something and gets the
- * same GEM handle, the memset has already happen and won't stomp all over
- * any data they may write in this BO.
+ * stomp to zero just in case. We explicitly do this *before* we actually
+ * close the GEM handle to ensure that if anyone allocates something and
+ * gets the same GEM handle, the memset has already happened and won't stomp
+ * all over any data they may write in this BO.
*/
- memset(bo, 0, sizeof(*bo));
+ struct anv_bo old_bo = *bo;
+
+ if (bo_is_xe_userptr)
+ vk_free(&device->vk.alloc, bo);
+ else
+ memset(bo, 0, sizeof(*bo));
- anv_gem_close(device, gem_handle);
+ anv_bo_finish(device, &old_bo);
/* Don't unlock until we've actually closed the BO. The whole point of
* the BO cache is to ensure that we correctly handle races with creating
diff --git a/src/intel/vulkan/anv_android.c b/src/intel/vulkan/anv_android.c
index 418e844c471..2cea3fc9f36 100644
--- a/src/intel/vulkan/anv_android.c
+++ b/src/intel/vulkan/anv_android.c
@@ -34,16 +34,14 @@
#include <sync/sync.h>
#include "anv_private.h"
+#include "vk_android.h"
+#include "vk_common_entrypoints.h"
#include "vk_util.h"
static int anv_hal_open(const struct hw_module_t* mod, const char* id, struct hw_device_t** dev);
static int anv_hal_close(struct hw_device_t *dev);
-static void UNUSED
-static_asserts(void)
-{
- STATIC_ASSERT(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC);
-}
+static_assert(HWVULKAN_DISPATCH_MAGIC == ICD_LOADER_MAGIC, "");
PUBLIC struct hwvulkan_module_t HAL_MODULE_INFO_SYM = {
.common = {
@@ -109,52 +107,34 @@ anv_hal_close(struct hw_device_t *dev)
enum {
/* Usage bit equal to GRALLOC_USAGE_HW_CAMERA_MASK */
- AHARDWAREBUFFER_USAGE_CAMERA_MASK = 0x00060000U,
+ BUFFER_USAGE_CAMERA_MASK = 0x00060000U,
};
inline VkFormat
vk_format_from_android(unsigned android_format, unsigned android_usage)
{
switch (android_format) {
- case AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM:
- return VK_FORMAT_R8G8B8A8_UNORM;
case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM:
- case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM:
return VK_FORMAT_R8G8B8_UNORM;
- case AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM:
- return VK_FORMAT_R5G6B5_UNORM_PACK16;
- case AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT:
- return VK_FORMAT_R16G16B16A16_SFLOAT;
- case AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM:
- return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
case AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420:
case HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL:
return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+ case AHARDWAREBUFFER_FORMAT_YV12:
+ return VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM;
case AHARDWAREBUFFER_FORMAT_IMPLEMENTATION_DEFINED:
- if (android_usage & AHARDWAREBUFFER_USAGE_CAMERA_MASK)
+ if (android_usage & BUFFER_USAGE_CAMERA_MASK)
return VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
else
return VK_FORMAT_R8G8B8_UNORM;
- case AHARDWAREBUFFER_FORMAT_BLOB:
default:
- return VK_FORMAT_UNDEFINED;
+ return vk_ahb_format_to_image_format(android_format);
}
}
-static inline unsigned
-android_format_from_vk(unsigned vk_format)
+unsigned
+anv_ahb_format_for_vk_format(VkFormat vk_format)
{
switch (vk_format) {
- case VK_FORMAT_R8G8B8A8_UNORM:
- return AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM;
- case VK_FORMAT_R8G8B8_UNORM:
- return AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM;
- case VK_FORMAT_R5G6B5_UNORM_PACK16:
- return AHARDWAREBUFFER_FORMAT_R5G6B5_UNORM;
- case VK_FORMAT_R16G16B16A16_SFLOAT:
- return AHARDWAREBUFFER_FORMAT_R16G16B16A16_FLOAT;
- case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
- return AHARDWAREBUFFER_FORMAT_R10G10B10A2_UNORM;
case VK_FORMAT_G8_B8R8_2PLANE_420_UNORM:
#ifdef HAVE_CROS_GRALLOC
return AHARDWAREBUFFER_FORMAT_Y8Cb8Cr8_420;
@@ -162,15 +142,15 @@ android_format_from_vk(unsigned vk_format)
return HAL_PIXEL_FORMAT_NV12_Y_TILED_INTEL;
#endif
default:
- return AHARDWAREBUFFER_FORMAT_BLOB;
+ return vk_image_format_to_ahb_format(vk_format);
}
}
static VkResult
-get_ahw_buffer_format_properties(
+get_ahw_buffer_format_properties2(
VkDevice device_h,
const struct AHardwareBuffer *buffer,
- VkAndroidHardwareBufferFormatPropertiesANDROID *pProperties)
+ VkAndroidHardwareBufferFormatProperties2ANDROID *pProperties)
{
ANV_FROM_HANDLE(anv_device, device, device_h);
@@ -191,12 +171,12 @@ get_ahw_buffer_format_properties(
return VK_ERROR_INVALID_EXTERNAL_HANDLE;
/* Fill properties fields based on description. */
- VkAndroidHardwareBufferFormatPropertiesANDROID *p = pProperties;
+ VkAndroidHardwareBufferFormatProperties2ANDROID *p = pProperties;
p->format = vk_format_from_android(desc.format, desc.usage);
+ p->externalFormat = p->format;
const struct anv_format *anv_format = anv_get_format(p->format);
- p->externalFormat = (uint64_t) (uintptr_t) anv_format;
/* Default to OPTIMAL tiling but set to linear in case
* of AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER usage.
@@ -207,8 +187,8 @@ get_ahw_buffer_format_properties(
tiling = VK_IMAGE_TILING_LINEAR;
p->formatFeatures =
- anv_get_image_format_features(&device->info, p->format, anv_format,
- tiling, NULL);
+ anv_get_image_format_features2(device->physical, p->format, anv_format,
+ tiling, NULL);
/* "Images can be created with an external format even if the Android hardware
* buffer has a format which has an equivalent Vulkan format to enable
@@ -223,7 +203,7 @@ get_ahw_buffer_format_properties(
* VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT"
*/
p->formatFeatures |=
- VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT;
+ VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT;
/* "Implementations may not always be able to determine the color model,
* numerical range, or chroma offsets of the image contents, so the values
@@ -257,10 +237,30 @@ anv_GetAndroidHardwareBufferPropertiesANDROID(
VkAndroidHardwareBufferFormatPropertiesANDROID *format_prop =
vk_find_struct(pProperties->pNext,
ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID);
-
/* Fill format properties of an Android hardware buffer. */
- if (format_prop)
- get_ahw_buffer_format_properties(device_h, buffer, format_prop);
+ if (format_prop) {
+ VkAndroidHardwareBufferFormatProperties2ANDROID format_prop2 = {
+ .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID,
+ };
+ get_ahw_buffer_format_properties2(device_h, buffer, &format_prop2);
+
+ format_prop->format = format_prop2.format;
+ format_prop->externalFormat = format_prop2.externalFormat;
+ format_prop->formatFeatures =
+ vk_format_features2_to_features(format_prop2.formatFeatures);
+ format_prop->samplerYcbcrConversionComponents =
+ format_prop2.samplerYcbcrConversionComponents;
+ format_prop->suggestedYcbcrModel = format_prop2.suggestedYcbcrModel;
+ format_prop->suggestedYcbcrRange = format_prop2.suggestedYcbcrRange;
+ format_prop->suggestedXChromaOffset = format_prop2.suggestedXChromaOffset;
+ format_prop->suggestedYChromaOffset = format_prop2.suggestedYChromaOffset;
+ }
+
+ VkAndroidHardwareBufferFormatProperties2ANDROID *format_prop2 =
+ vk_find_struct(pProperties->pNext,
+ ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_2_ANDROID);
+ if (format_prop2)
+ get_ahw_buffer_format_properties2(device_h, buffer, format_prop2);
/* NOTE - We support buffers with only one handle but do not error on
* multiple handle case. Reason is that we want to support YUV formats
@@ -282,81 +282,21 @@ anv_GetAndroidHardwareBufferPropertiesANDROID(
return VK_SUCCESS;
}
-VkResult
-anv_GetMemoryAndroidHardwareBufferANDROID(
- VkDevice device_h,
- const VkMemoryGetAndroidHardwareBufferInfoANDROID *pInfo,
- struct AHardwareBuffer **pBuffer)
-{
- ANV_FROM_HANDLE(anv_device_memory, mem, pInfo->memory);
-
- /* Some quotes from Vulkan spec:
- *
- * "If the device memory was created by importing an Android hardware
- * buffer, vkGetMemoryAndroidHardwareBufferANDROID must return that same
- * Android hardware buffer object."
- *
- * "VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID must
- * have been included in VkExportMemoryAllocateInfo::handleTypes when
- * memory was created."
- */
- if (mem->ahw) {
- *pBuffer = mem->ahw;
- /* Increase refcount. */
- AHardwareBuffer_acquire(mem->ahw);
- return VK_SUCCESS;
- }
-
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-}
-
-#endif
-
-/* Construct ahw usage mask from image usage bits, see
- * 'AHardwareBuffer Usage Equivalence' in Vulkan spec.
- */
-uint64_t
-anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create,
- const VkImageUsageFlags vk_usage)
-{
- uint64_t ahw_usage = 0;
-#if ANDROID_API_LEVEL >= 26
- if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE;
-
- if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE;
-
- if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_COLOR_OUTPUT;
-
- if (vk_create & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_GPU_CUBE_MAP;
-
- if (vk_create & VK_IMAGE_CREATE_PROTECTED_BIT)
- ahw_usage |= AHARDWAREBUFFER_USAGE_PROTECTED_CONTENT;
-
- /* No usage bits set - set at least one GPU usage. */
- if (ahw_usage == 0)
- ahw_usage = AHARDWAREBUFFER_USAGE_GPU_SAMPLED_IMAGE;
#endif
- return ahw_usage;
-}
/*
* Called from anv_AllocateMemory when import AHardwareBuffer.
*/
VkResult
anv_import_ahw_memory(VkDevice device_h,
- struct anv_device_memory *mem,
- const VkImportAndroidHardwareBufferInfoANDROID *info)
+ struct anv_device_memory *mem)
{
#if ANDROID_API_LEVEL >= 26
ANV_FROM_HANDLE(anv_device, device, device_h);
/* Import from AHardwareBuffer to anv_device_memory. */
const native_handle_t *handle =
- AHardwareBuffer_getNativeHandle(info->buffer);
+ AHardwareBuffer_getNativeHandle(mem->vk.ahardware_buffer);
/* NOTE - We support buffers with only one handle but do not error on
* multiple handle case. Reason is that we want to support YUV formats
@@ -372,14 +312,6 @@ anv_import_ahw_memory(VkDevice device_h,
&mem->bo);
assert(result == VK_SUCCESS);
- /* "If the vkAllocateMemory command succeeds, the implementation must
- * acquire a reference to the imported hardware buffer, which it must
- * release when the device memory object is freed. If the command fails,
- * the implementation must not retain a reference."
- */
- AHardwareBuffer_acquire(info->buffer);
- mem->ahw = info->buffer;
-
return VK_SUCCESS;
#else
return VK_ERROR_EXTENSION_NOT_PRESENT;
@@ -387,80 +319,11 @@ anv_import_ahw_memory(VkDevice device_h,
}
VkResult
-anv_create_ahw_memory(VkDevice device_h,
- struct anv_device_memory *mem,
- const VkMemoryAllocateInfo *pAllocateInfo)
-{
-#if ANDROID_API_LEVEL >= 26
- const VkMemoryDedicatedAllocateInfo *dedicated_info =
- vk_find_struct_const(pAllocateInfo->pNext,
- MEMORY_DEDICATED_ALLOCATE_INFO);
-
- uint32_t w = 0;
- uint32_t h = 1;
- uint32_t layers = 1;
- uint32_t format = 0;
- uint64_t usage = 0;
-
- /* If caller passed dedicated information. */
- if (dedicated_info && dedicated_info->image) {
- ANV_FROM_HANDLE(anv_image, image, dedicated_info->image);
- w = image->vk.extent.width;
- h = image->vk.extent.height;
- layers = image->vk.array_layers;
- format = android_format_from_vk(image->vk.format);
- usage = anv_ahw_usage_from_vk_usage(image->vk.create_flags, image->vk.usage);
- } else if (dedicated_info && dedicated_info->buffer) {
- ANV_FROM_HANDLE(anv_buffer, buffer, dedicated_info->buffer);
- w = buffer->size;
- format = AHARDWAREBUFFER_FORMAT_BLOB;
- usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN |
- AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
- } else {
- w = pAllocateInfo->allocationSize;
- format = AHARDWAREBUFFER_FORMAT_BLOB;
- usage = AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN |
- AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN;
- }
-
- struct AHardwareBuffer *ahw = NULL;
- struct AHardwareBuffer_Desc desc = {
- .width = w,
- .height = h,
- .layers = layers,
- .format = format,
- .usage = usage,
- };
-
- if (AHardwareBuffer_allocate(&desc, &ahw) != 0)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-
- const VkImportAndroidHardwareBufferInfoANDROID import_info = {
- .buffer = ahw,
- };
- VkResult result = anv_import_ahw_memory(device_h, mem, &import_info);
-
- /* Release a reference to avoid leak for AHB allocation. */
- AHardwareBuffer_release(ahw);
-
- return result;
-#else
- return VK_ERROR_EXTENSION_NOT_PRESENT;
-#endif
-
-}
-
-VkResult
-anv_image_from_gralloc(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkNativeBufferANDROID *gralloc_info,
- const VkAllocationCallbacks *alloc,
- VkImage *out_image_h)
-
+anv_image_init_from_gralloc(struct anv_device *device,
+ struct anv_image *image,
+ const VkImageCreateInfo *base_info,
+ const VkNativeBufferANDROID *gralloc_info)
{
- ANV_FROM_HANDLE(anv_device, device, device_h);
- VkImage image_h = VK_NULL_HANDLE;
- struct anv_image *image = NULL;
struct anv_bo *bo = NULL;
VkResult result;
@@ -469,13 +332,6 @@ anv_image_from_gralloc(VkDevice device_h,
.isl_extra_usage_flags = ISL_SURF_USAGE_DISABLE_AUX_BIT,
};
- if (gralloc_info->handle->numFds != 1) {
- return vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "VkNativeBufferANDROID::handle::numFds is %d, "
- "expected 1", gralloc_info->handle->numFds);
- }
-
/* Do not close the gralloc handle's dma_buf. The lifetime of the dma_buf
* must exceed that of the gralloc handle, and we do not own the gralloc
* handle.
@@ -492,69 +348,43 @@ anv_image_from_gralloc(VkDevice device_h,
*
*/
result = anv_device_import_bo(device, dma_buf,
+ ANV_BO_ALLOC_EXTERNAL |
ANV_BO_ALLOC_IMPLICIT_SYNC |
ANV_BO_ALLOC_IMPLICIT_WRITE,
0 /* client_address */,
&bo);
if (result != VK_SUCCESS) {
- return vk_errorf(device, &device->vk.base, result,
+ return vk_errorf(device, result,
"failed to import dma-buf from VkNativeBufferANDROID");
}
- int i915_tiling = anv_gem_get_tiling(device, bo->gem_handle);
- switch (i915_tiling) {
- case I915_TILING_NONE:
- anv_info.isl_tiling_flags = ISL_TILING_LINEAR_BIT;
- break;
- case I915_TILING_X:
- anv_info.isl_tiling_flags = ISL_TILING_X_BIT;
- break;
- case I915_TILING_Y:
- anv_info.isl_tiling_flags = ISL_TILING_Y0_BIT;
- break;
- case -1:
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "DRM_IOCTL_I915_GEM_GET_TILING failed for "
- "VkNativeBufferANDROID");
- goto fail_tiling;
- default:
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "DRM_IOCTL_I915_GEM_GET_TILING returned unknown "
- "tiling %d for VkNativeBufferANDROID", i915_tiling);
- goto fail_tiling;
+ enum isl_tiling tiling;
+ result = anv_device_get_bo_tiling(device, bo, &tiling);
+ if (result != VK_SUCCESS) {
+ return vk_errorf(device, result,
+ "failed to get tiling from VkNativeBufferANDROID");
}
+ anv_info.isl_tiling_flags = 1u << tiling;
- enum isl_format format = anv_get_isl_format(&device->info,
- base_info->format,
- VK_IMAGE_ASPECT_COLOR_BIT,
- base_info->tiling);
- assert(format != ISL_FORMAT_UNSUPPORTED);
+ anv_info.stride = gralloc_info->stride;
- result = anv_image_create(device_h, &anv_info, alloc, &image_h);
- image = anv_image_from_handle(image_h);
+ result = anv_image_init(device, image, &anv_info);
if (result != VK_SUCCESS)
- goto fail_create;
-
- VkImageMemoryRequirementsInfo2 mem_reqs_info = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
- .image = image_h,
- };
+ goto fail_init;
VkMemoryRequirements2 mem_reqs = {
.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
};
- anv_GetImageMemoryRequirements2(device_h, &mem_reqs_info, &mem_reqs);
+ anv_image_get_memory_requirements(device, image, image->vk.aspects,
+ &mem_reqs);
VkDeviceSize aligned_image_size =
- align_u64(mem_reqs.memoryRequirements.size,
- mem_reqs.memoryRequirements.alignment);
+ align64(mem_reqs.memoryRequirements.size,
+ mem_reqs.memoryRequirements.alignment);
if (bo->size < aligned_image_size) {
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"dma-buf from VkNativeBufferANDROID is too small for "
"VkImage: %"PRIu64"B < %"PRIu64"B",
bo->size, aligned_image_size);
@@ -570,15 +400,11 @@ anv_image_from_gralloc(VkDevice device_h,
image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo = bo;
image->from_gralloc = true;
- /* Don't clobber the out-parameter until success is certain. */
- *out_image_h = image_h;
-
return VK_SUCCESS;
fail_size:
- anv_DestroyImage(device_h, image_h, alloc);
- fail_create:
- fail_tiling:
+ anv_image_finish(image);
+ fail_init:
anv_device_release_bo(device, bo);
return result;
@@ -606,18 +432,19 @@ anv_image_bind_from_gralloc(struct anv_device *device,
*/
struct anv_bo *bo = NULL;
VkResult result = anv_device_import_bo(device, dma_buf,
+ ANV_BO_ALLOC_EXTERNAL |
ANV_BO_ALLOC_IMPLICIT_SYNC |
ANV_BO_ALLOC_IMPLICIT_WRITE,
0 /* client_address */,
&bo);
if (result != VK_SUCCESS) {
- return vk_errorf(device, &device->vk.base, result,
+ return vk_errorf(device, result,
"failed to import dma-buf from VkNativeBufferANDROID");
}
uint64_t img_size = image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].memory_range.size;
if (img_size < bo->size) {
- result = vk_errorf(device, &device->vk.base, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"dma-buf from VkNativeBufferANDROID is too small for "
"VkImage: %"PRIu64"B < %"PRIu64"B",
bo->size, img_size);
@@ -661,7 +488,7 @@ format_supported_with_usage(VkDevice device_h, VkFormat format,
result = anv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h,
&image_format_info, &image_format_props);
if (result != VK_SUCCESS) {
- return vk_errorf(device, &device->vk.base, result,
+ return vk_errorf(device, result,
"anv_GetPhysicalDeviceImageFormatProperties2 failed "
"inside %s", __func__);
}
@@ -700,7 +527,7 @@ setup_gralloc0_usage(struct anv_device *device, VkFormat format,
* gralloc swapchains.
*/
if (imageUsage != 0) {
- return vk_errorf(device, &device->vk.base, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ return vk_errorf(device, VK_ERROR_FORMAT_NOT_SUPPORTED,
"unsupported VkImageUsageFlags(0x%x) for gralloc "
"swapchain", imageUsage);
}
@@ -745,7 +572,8 @@ VkResult anv_GetSwapchainGrallocUsage2ANDROID(
*grallocConsumerUsage = 0;
*grallocProducerUsage = 0;
- mesa_logd("%s: format=%d, usage=0x%x", __func__, format, imageUsage);
+ mesa_logd("%s: format=%d, usage=0x%x, swapchainUsage=0x%x", __func__, format,
+ imageUsage, swapchainImageUsage);
result = format_supported_with_usage(device_h, format, imageUsage);
if (result != VK_SUCCESS)
@@ -774,6 +602,13 @@ VkResult anv_GetSwapchainGrallocUsage2ANDROID(
*grallocConsumerUsage |= GRALLOC1_CONSUMER_USAGE_HWCOMPOSER;
}
+ if ((swapchainImageUsage & VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID) &&
+ device->u_gralloc != NULL) {
+ uint64_t front_rendering_usage = 0;
+ u_gralloc_get_front_rendering_usage(device->u_gralloc, &front_rendering_usage);
+ *grallocProducerUsage |= front_rendering_usage;
+ }
+
return VK_SUCCESS;
}
#endif
@@ -796,115 +631,3 @@ VkResult anv_GetSwapchainGrallocUsageANDROID(
return setup_gralloc0_usage(device, format, imageUsage, grallocUsage);
}
-
-VkResult
-anv_AcquireImageANDROID(
- VkDevice device_h,
- VkImage image_h,
- int nativeFenceFd,
- VkSemaphore semaphore_h,
- VkFence fence_h)
-{
- VkResult result = VK_SUCCESS;
-
- /* From https://source.android.com/devices/graphics/implement-vulkan :
- *
- * "The driver takes ownership of the fence file descriptor and closes
- * the fence file descriptor when no longer needed. The driver must do
- * so even if neither a semaphore or fence object is provided, or even
- * if vkAcquireImageANDROID fails and returns an error."
- *
- * The Vulkan spec for VkImportFence/SemaphoreFdKHR(), however, requires
- * the file descriptor to be left alone on failure.
- */
- int semaphore_fd = -1, fence_fd = -1;
- if (nativeFenceFd >= 0) {
- if (semaphore_h != VK_NULL_HANDLE && fence_h != VK_NULL_HANDLE) {
- /* We have both so we have to import the sync file twice. One of
- * them needs to be a dup.
- */
- semaphore_fd = nativeFenceFd;
- fence_fd = dup(nativeFenceFd);
- if (fence_fd < 0) {
- VkResult err = (errno == EMFILE) ? VK_ERROR_TOO_MANY_OBJECTS :
- VK_ERROR_OUT_OF_HOST_MEMORY;
- close(nativeFenceFd);
- return vk_error(err);
- }
- } else if (semaphore_h != VK_NULL_HANDLE) {
- semaphore_fd = nativeFenceFd;
- } else if (fence_h != VK_NULL_HANDLE) {
- fence_fd = nativeFenceFd;
- } else {
- /* Nothing to import into so we have to close the file */
- close(nativeFenceFd);
- }
- }
-
- if (semaphore_h != VK_NULL_HANDLE) {
- const VkImportSemaphoreFdInfoKHR info = {
- .sType = VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR,
- .semaphore = semaphore_h,
- .flags = VK_SEMAPHORE_IMPORT_TEMPORARY_BIT,
- .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT,
- .fd = semaphore_fd,
- };
- result = anv_ImportSemaphoreFdKHR(device_h, &info);
- if (result == VK_SUCCESS)
- semaphore_fd = -1; /* ANV took ownership */
- }
-
- if (result == VK_SUCCESS && fence_h != VK_NULL_HANDLE) {
- const VkImportFenceFdInfoKHR info = {
- .sType = VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR,
- .fence = fence_h,
- .flags = VK_FENCE_IMPORT_TEMPORARY_BIT,
- .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
- .fd = fence_fd,
- };
- result = anv_ImportFenceFdKHR(device_h, &info);
- if (result == VK_SUCCESS)
- fence_fd = -1; /* ANV took ownership */
- }
-
- if (semaphore_fd >= 0)
- close(semaphore_fd);
- if (fence_fd >= 0)
- close(fence_fd);
-
- return result;
-}
-
-VkResult
-anv_QueueSignalReleaseImageANDROID(
- VkQueue queue,
- uint32_t waitSemaphoreCount,
- const VkSemaphore* pWaitSemaphores,
- VkImage image,
- int* pNativeFenceFd)
-{
- VkResult result;
-
- if (waitSemaphoreCount == 0)
- goto done;
-
- result = anv_QueueSubmit(queue, 1,
- &(VkSubmitInfo) {
- .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
- .waitSemaphoreCount = 1,
- .pWaitSemaphores = pWaitSemaphores,
- },
- (VkFence) VK_NULL_HANDLE);
- if (result != VK_SUCCESS)
- return result;
-
- done:
- if (pNativeFenceFd) {
- /* We can rely implicit on sync because above we submitted all
- * semaphores to the queue.
- */
- *pNativeFenceFd = -1;
- }
-
- return VK_SUCCESS;
-}
diff --git a/src/intel/vulkan/anv_android.h b/src/intel/vulkan/anv_android.h
index 2e329b3029c..cbd0a0a1634 100644
--- a/src/intel/vulkan/anv_android.h
+++ b/src/intel/vulkan/anv_android.h
@@ -24,7 +24,9 @@
#ifndef ANV_ANDROID_H
#define ANV_ANDROID_H
-#if defined(ANDROID) && ANDROID_API_LEVEL >= 26
+#include "util/detect_os.h"
+
+#if DETECT_OS_ANDROID && ANDROID_API_LEVEL >= 26
#include <vndk/hardware_buffer.h>
#endif
#include <vulkan/vulkan.h>
@@ -35,30 +37,21 @@ struct anv_device_memory;
struct anv_device;
struct anv_image;
-VkResult anv_image_from_gralloc(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkNativeBufferANDROID *gralloc_info,
- const VkAllocationCallbacks *alloc,
- VkImage *pImage);
+VkResult anv_image_init_from_gralloc(struct anv_device *device,
+ struct anv_image *image,
+ const VkImageCreateInfo *base_info,
+ const VkNativeBufferANDROID *gralloc_info);
VkResult anv_image_bind_from_gralloc(struct anv_device *device,
struct anv_image *image,
const VkNativeBufferANDROID *gralloc_info);
-VkResult anv_image_from_external(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkExternalMemoryImageCreateInfo *create_info,
- const VkAllocationCallbacks *alloc,
- VkImage *out_image_h);
-
-uint64_t anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create,
- const VkImageUsageFlags vk_usage);
+unsigned anv_ahb_format_for_vk_format(VkFormat vk_format);
VkResult anv_import_ahw_memory(VkDevice device_h,
- struct anv_device_memory *mem,
- const VkImportAndroidHardwareBufferInfoANDROID *info);
+ struct anv_device_memory *mem);
VkResult anv_create_ahw_memory(VkDevice device_h,
struct anv_device_memory *mem,
- const VkMemoryAllocateInfo *pAllocateInfo);
+ const VkMemoryDedicatedAllocateInfo *dedicated_info);
#endif /* ANV_ANDROID_H */
diff --git a/src/intel/vulkan/anv_android_stubs.c b/src/intel/vulkan/anv_android_stubs.c
index f6b2d1c8dd1..f1b2ef6b6f8 100644
--- a/src/intel/vulkan/anv_android_stubs.c
+++ b/src/intel/vulkan/anv_android_stubs.c
@@ -24,11 +24,10 @@
#include "anv_android.h"
VkResult
-anv_image_from_gralloc(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkNativeBufferANDROID *gralloc_info,
- const VkAllocationCallbacks *alloc,
- VkImage *pImage)
+anv_image_init_from_gralloc(struct anv_device *device,
+ struct anv_image *image,
+ const VkImageCreateInfo *base_info,
+ const VkNativeBufferANDROID *gralloc_info)
{
return VK_ERROR_EXTENSION_NOT_PRESENT;
}
@@ -40,17 +39,14 @@ VkResult anv_image_bind_from_gralloc(struct anv_device *device,
return VK_ERROR_EXTENSION_NOT_PRESENT;
}
-uint64_t
-anv_ahw_usage_from_vk_usage(const VkImageCreateFlags vk_create,
- const VkImageUsageFlags vk_usage)
+unsigned anv_ahb_format_for_vk_format(VkFormat vk_format)
{
return 0;
}
VkResult
anv_import_ahw_memory(VkDevice device_h,
- struct anv_device_memory *mem,
- const VkImportAndroidHardwareBufferInfoANDROID *info)
+ struct anv_device_memory *mem)
{
return VK_ERROR_EXTENSION_NOT_PRESENT;
}
@@ -58,17 +54,7 @@ anv_import_ahw_memory(VkDevice device_h,
VkResult
anv_create_ahw_memory(VkDevice device_h,
struct anv_device_memory *mem,
- const VkMemoryAllocateInfo *pAllocateInfo)
-{
- return VK_ERROR_EXTENSION_NOT_PRESENT;
-}
-
-VkResult
-anv_image_from_external(VkDevice device_h,
- const VkImageCreateInfo *base_info,
- const VkExternalMemoryImageCreateInfo *create_info,
- const VkAllocationCallbacks *alloc,
- VkImage *out_image_h)
+ const VkMemoryDedicatedAllocateInfo *dedicated_info)
{
return VK_ERROR_EXTENSION_NOT_PRESENT;
}
diff --git a/src/intel/vulkan/anv_astc_emu.c b/src/intel/vulkan/anv_astc_emu.c
new file mode 100644
index 00000000000..7a0f354a5e5
--- /dev/null
+++ b/src/intel/vulkan/anv_astc_emu.c
@@ -0,0 +1,516 @@
+/*
+ * Copyright 2023 Google LLC
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "anv_private.h"
+
+#include "compiler/nir/nir_builder.h"
+
+static void
+astc_emu_init_image_view(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image_view *iview,
+ struct anv_image *image,
+ VkFormat format,
+ VkImageUsageFlags usage,
+ uint32_t level, uint32_t layer)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ const VkImageViewCreateInfo create_info = {
+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+ .pNext = &(VkImageViewUsageCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_USAGE_CREATE_INFO,
+ .usage = usage,
+ },
+ .image = anv_image_to_handle(image),
+ /* XXX we only need 2D but the shader expects 2D_ARRAY */
+ .viewType = VK_IMAGE_VIEW_TYPE_2D_ARRAY,
+ .format = format,
+ .subresourceRange = {
+ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+ .baseMipLevel = level,
+ .levelCount = 1,
+ .baseArrayLayer = layer,
+ .layerCount = 1,
+ },
+ };
+
+ memset(iview, 0, sizeof(*iview));
+ anv_image_view_init(device, iview, &create_info,
+ &cmd_buffer->surface_state_stream);
+}
+
+static void
+astc_emu_init_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_push_descriptor_set *push_set,
+ VkDescriptorSetLayout _layout,
+ uint32_t write_count,
+ const VkWriteDescriptorSet *writes)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_descriptor_set_layout *layout =
+ anv_descriptor_set_layout_from_handle(_layout);
+
+ memset(push_set, 0, sizeof(*push_set));
+ anv_push_descriptor_set_init(cmd_buffer, push_set, layout);
+
+ anv_descriptor_set_write(device, &push_set->set, write_count, writes);
+}
+
+static void
+astc_emu_init_flush_denorm_shader(nir_builder *b)
+{
+ b->shader->info.workgroup_size[0] = 8;
+ b->shader->info.workgroup_size[1] = 8;
+
+ const struct glsl_type *src_type =
+ glsl_sampler_type(GLSL_SAMPLER_DIM_2D, false, true, GLSL_TYPE_UINT);
+ nir_variable *src_var =
+ nir_variable_create(b->shader, nir_var_uniform, src_type, "src");
+ src_var->data.descriptor_set = 0;
+ src_var->data.binding = 0;
+
+ const struct glsl_type *dst_type =
+ glsl_image_type(GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_UINT);
+ nir_variable *dst_var =
+ nir_variable_create(b->shader, nir_var_uniform, dst_type, "dst");
+ dst_var->data.descriptor_set = 0;
+ dst_var->data.binding = 1;
+
+ nir_def *zero = nir_imm_int(b, 0);
+ nir_def *consts = nir_load_push_constant(b, 4, 32, zero, .range = 16);
+ nir_def *offset = nir_channels(b, consts, 0x3);
+ nir_def *extent = nir_channels(b, consts, 0x3 << 2);
+
+ nir_def *coord = nir_load_global_invocation_id(b, 32);
+ coord = nir_iadd(b, nir_channels(b, coord, 0x3), offset);
+
+ nir_def *cond = nir_ilt(b, coord, extent);
+ cond = nir_iand(b, nir_channel(b, cond, 0), nir_channel(b, cond, 1));
+ nir_push_if(b, cond);
+ {
+ const struct glsl_type *val_type = glsl_vector_type(GLSL_TYPE_UINT, 4);
+ nir_variable *val_var =
+ nir_variable_create(b->shader, nir_var_shader_temp, val_type, "val");
+
+ coord = nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1),
+ zero);
+ nir_def *val =
+ nir_txf_deref(b, nir_build_deref_var(b, src_var), coord, zero);
+ nir_store_var(b, val_var, val, 0xf);
+
+ /* A void-extent block has this layout
+ *
+ * struct astc_void_extent_block {
+ * uint16_t header;
+ * uint16_t dontcare0;
+ * uint16_t dontcare1;
+ * uint16_t dontcare2;
+ * uint16_t R;
+ * uint16_t G;
+ * uint16_t B;
+ * uint16_t A;
+ * };
+ *
+ * where the lower 12 bits are 0xdfc for 2D LDR.
+ */
+ nir_def *block_mode = nir_iand_imm(b, nir_channel(b, val, 0), 0xfff);
+ nir_push_if(b, nir_ieq_imm(b, block_mode, 0xdfc));
+ {
+ nir_def *color = nir_channels(b, val, 0x3 << 2);
+ nir_def *comps = nir_unpack_64_4x16(b, nir_pack_64_2x32(b, color));
+
+ /* flush denorms */
+ comps = nir_bcsel(b, nir_ult_imm(b, comps, 4),
+ nir_imm_intN_t(b, 0, 16), comps);
+
+ color = nir_unpack_64_2x32(b, nir_pack_64_4x16(b, comps));
+ val = nir_vec4(b, nir_channel(b, val, 0), nir_channel(b, val, 1),
+ nir_channel(b, color, 0), nir_channel(b, color, 1));
+ nir_store_var(b, val_var, val, 0x3 << 2);
+ }
+ nir_pop_if(b, NULL);
+
+ nir_def *dst = &nir_build_deref_var(b, dst_var)->def;
+ coord = nir_pad_vector(b, coord, 4);
+ val = nir_load_var(b, val_var);
+ nir_image_deref_store(b, dst, coord, nir_undef(b, 1, 32), val, zero,
+ .image_dim = GLSL_SAMPLER_DIM_2D,
+ .image_array = true);
+ }
+ nir_pop_if(b, NULL);
+}
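+
For reference, the void-extent handling built by the shader above can be mirrored on the CPU. The sketch below only illustrates the block-layout comment (2D LDR void-extent header 0xDFC in the low 12 bits, 16-bit components below 4 flushed to zero); flush_void_extent_denorms is a made-up name and not part of the emulation path.

#include <stdint.h>

/* Words 0-1 of a void-extent block hold the header and don't-care fields,
 * words 2-3 hold the four 16-bit R/G/B/A values. Mirroring the shader,
 * components smaller than 4 are forced to zero.
 */
static void
flush_void_extent_denorms(uint32_t block[4])
{
   if ((block[0] & 0xfff) != 0xdfc)   /* not a 2D LDR void-extent block */
      return;

   for (unsigned i = 2; i < 4; i++) {
      uint16_t lo = block[i] & 0xffff;
      uint16_t hi = block[i] >> 16;
      if (lo < 4) lo = 0;
      if (hi < 4) hi = 0;
      block[i] = ((uint32_t)hi << 16) | lo;
   }
}
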
+
+static VkResult
+astc_emu_init_flush_denorm_pipeline_locked(struct anv_device *device)
+{
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkDevice _device = anv_device_to_handle(device);
+ VkResult result = VK_SUCCESS;
+
+ if (astc_emu->ds_layout == VK_NULL_HANDLE) {
+ const VkDescriptorSetLayoutCreateInfo ds_layout_create_info = {
+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
+ .bindingCount = 2,
+ .pBindings = (VkDescriptorSetLayoutBinding[]){
+ {
+ .binding = 0,
+ .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ },
+ {
+ .binding = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .descriptorCount = 1,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ },
+ },
+ };
+ result = anv_CreateDescriptorSetLayout(_device, &ds_layout_create_info,
+ NULL, &astc_emu->ds_layout);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ if (astc_emu->pipeline_layout == VK_NULL_HANDLE) {
+ const VkPipelineLayoutCreateInfo pipeline_layout_create_info = {
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+ .setLayoutCount = 1,
+ .pSetLayouts = &astc_emu->ds_layout,
+ .pushConstantRangeCount = 1,
+ .pPushConstantRanges = &(VkPushConstantRange){
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .size = sizeof(uint32_t) * 4,
+ },
+ };
+ result = anv_CreatePipelineLayout(_device, &pipeline_layout_create_info,
+ NULL, &astc_emu->pipeline_layout);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ if (astc_emu->pipeline == VK_NULL_HANDLE) {
+ const struct nir_shader_compiler_options *options =
+ device->physical->compiler->nir_options[MESA_SHADER_COMPUTE];
+ nir_builder b = nir_builder_init_simple_shader(
+ MESA_SHADER_COMPUTE, options, "astc_emu_flush_denorm");
+ astc_emu_init_flush_denorm_shader(&b);
+
+ const VkComputePipelineCreateInfo pipeline_create_info = {
+ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+ .stage =
+ (VkPipelineShaderStageCreateInfo){
+ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+ .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+ .module = vk_shader_module_handle_from_nir(b.shader),
+ .pName = "main",
+ },
+ .layout = astc_emu->pipeline_layout,
+ };
+ result = anv_CreateComputePipelines(_device, VK_NULL_HANDLE, 1,
+ &pipeline_create_info, NULL,
+ &astc_emu->pipeline);
+ ralloc_free(b.shader);
+
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+out:
+ return result;
+}
+
+static VkResult
+astc_emu_init_flush_denorm_pipeline(struct anv_device *device)
+{
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkResult result = VK_SUCCESS;
+
+ simple_mtx_lock(&astc_emu->mutex);
+ if (!astc_emu->pipeline)
+ result = astc_emu_init_flush_denorm_pipeline_locked(device);
+ simple_mtx_unlock(&astc_emu->mutex);
+
+ return result;
+}
+
+static void
+astc_emu_flush_denorm_slice(struct anv_cmd_buffer *cmd_buffer,
+ VkFormat astc_format,
+ VkImageLayout layout,
+ VkImageView src_view,
+ VkImageView dst_view,
+ VkRect2D rect)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkCommandBuffer cmd_buffer_ = anv_cmd_buffer_to_handle(cmd_buffer);
+
+ VkResult result = astc_emu_init_flush_denorm_pipeline(device);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+
+ const uint32_t push_const[] = {
+ rect.offset.x,
+ rect.offset.y,
+ rect.offset.x + rect.extent.width,
+ rect.offset.y + rect.extent.height,
+ };
+
+ const VkWriteDescriptorSet set_writes[] = {
+ {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstBinding = 0,
+ .descriptorCount = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+ .pImageInfo = &(VkDescriptorImageInfo){
+ .imageView = src_view,
+ .imageLayout = layout,
+ },
+ },
+ {
+ .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+ .dstBinding = 1,
+ .descriptorCount = 1,
+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ .pImageInfo = &(VkDescriptorImageInfo){
+ .imageView = dst_view,
+ .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+ },
+ },
+ };
+ struct anv_push_descriptor_set push_set;
+ astc_emu_init_push_descriptor_set(cmd_buffer,
+ &push_set,
+ astc_emu->ds_layout,
+ ARRAY_SIZE(set_writes),
+ set_writes);
+ VkDescriptorSet set = anv_descriptor_set_to_handle(&push_set.set);
+
+ anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE,
+ astc_emu->pipeline);
+
+ VkPushConstantsInfoKHR push_info = {
+ .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
+ .layout = astc_emu->pipeline_layout,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .offset = 0,
+ .size = sizeof(push_const),
+ .pValues = push_const,
+ };
+ anv_CmdPushConstants2KHR(cmd_buffer_, &push_info);
+
+ VkBindDescriptorSetsInfoKHR bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_DESCRIPTOR_SETS_INFO_KHR,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .layout = astc_emu->pipeline_layout,
+ .firstSet = 0,
+ .descriptorSetCount = 1,
+ .pDescriptorSets = &set,
+ .dynamicOffsetCount = 0,
+ .pDynamicOffsets = NULL,
+ };
+ anv_CmdBindDescriptorSets2KHR(cmd_buffer_, &bind_info);
+
+ /* each workgroup processes 8x8 texel blocks */
+ rect.extent.width = DIV_ROUND_UP(rect.extent.width, 8);
+ rect.extent.height = DIV_ROUND_UP(rect.extent.height, 8);
+
+ anv_genX(device->info, CmdDispatchBase)(cmd_buffer_, 0, 0, 0,
+ rect.extent.width,
+ rect.extent.height,
+ 1);
+
+ anv_push_descriptor_set_finish(&push_set);
+}
+
+static void
+astc_emu_decompress_slice(struct anv_cmd_buffer *cmd_buffer,
+ VkFormat astc_format,
+ VkImageLayout layout,
+ VkImageView src_view,
+ VkImageView dst_view,
+ VkRect2D rect)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkCommandBuffer cmd_buffer_ = anv_cmd_buffer_to_handle(cmd_buffer);
+
+ VkPipeline pipeline =
+ vk_texcompress_astc_get_decode_pipeline(&device->vk, &device->vk.alloc,
+ astc_emu->texcompress,
+ VK_NULL_HANDLE, astc_format);
+ if (pipeline == VK_NULL_HANDLE) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
+ return;
+ }
+
+ anv_CmdBindPipeline(cmd_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
+
+ struct vk_texcompress_astc_write_descriptor_set writes;
+ vk_texcompress_astc_fill_write_descriptor_sets(astc_emu->texcompress,
+ &writes, src_view, layout,
+ dst_view, astc_format);
+
+ struct anv_push_descriptor_set push_set;
+ astc_emu_init_push_descriptor_set(cmd_buffer, &push_set,
+ astc_emu->texcompress->ds_layout,
+ ARRAY_SIZE(writes.descriptor_set),
+ writes.descriptor_set);
+
+ VkDescriptorSet set = anv_descriptor_set_to_handle(&push_set.set);
+
+ VkBindDescriptorSetsInfoKHR bind_info = {
+ .sType = VK_STRUCTURE_TYPE_BIND_DESCRIPTOR_SETS_INFO_KHR,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .layout = astc_emu->texcompress->p_layout,
+ .firstSet = 0,
+ .descriptorSetCount = 1,
+ .pDescriptorSets = &set,
+ .dynamicOffsetCount = 0,
+ .pDynamicOffsets = NULL,
+ };
+ anv_CmdBindDescriptorSets2KHR(cmd_buffer_, &bind_info);
+
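+ /* The offset is passed in block coordinates while the upper bounds are
+ * scaled to texels using the ASTC block dimensions; the last entry is the
+ * is-3D flag (always false here since 3D views are not used).
+ */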
+ const uint32_t push_const[] = {
+ rect.offset.x,
+ rect.offset.y,
+ (rect.offset.x + rect.extent.width) *
+ vk_format_get_blockwidth(astc_format),
+ (rect.offset.y + rect.extent.height) *
+ vk_format_get_blockheight(astc_format),
+ false, /* we don't use VK_IMAGE_VIEW_TYPE_3D */
+ };
+ VkPushConstantsInfoKHR push_info = {
+ .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
+ .layout = astc_emu->texcompress->p_layout,
+ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+ .offset = 0,
+ .size = sizeof(push_const),
+ .pValues = push_const,
+ };
+ anv_CmdPushConstants2KHR(cmd_buffer_, &push_info);
+
+ /* each workgroup processes 2x2 texel blocks */
+ rect.extent.width = DIV_ROUND_UP(rect.extent.width, 2);
+ rect.extent.height = DIV_ROUND_UP(rect.extent.height, 2);
+
+ anv_genX(device->info, CmdDispatchBase)(cmd_buffer_, 0, 0, 0,
+ rect.extent.width,
+ rect.extent.height,
+ 1);
+
+ anv_push_descriptor_set_finish(&push_set);
+}
+
+void
+anv_astc_emu_process(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image *image,
+ VkImageLayout layout,
+ const VkImageSubresourceLayers *subresource,
+ VkOffset3D block_offset,
+ VkExtent3D block_extent)
+{
+ const bool flush_denorms =
+ cmd_buffer->device->physical->flush_astc_ldr_void_extent_denorms;
+
+ assert(image->emu_plane_format != VK_FORMAT_UNDEFINED);
+
+ const VkRect2D rect = {
+ .offset = {
+ .x = block_offset.x,
+ .y = block_offset.y,
+ },
+ .extent = {
+ .width = block_extent.width,
+ .height = block_extent.height,
+ },
+ };
+
+ /* process one layer at a time because anv_image_fill_surface_state
+ * requires an uncompressed view of a compressed image to be a single
+ * layer
+ */
+ const bool is_3d = image->vk.image_type == VK_IMAGE_TYPE_3D;
+ const uint32_t slice_base = is_3d ?
+ block_offset.z : subresource->baseArrayLayer;
+ const uint32_t slice_count = is_3d ?
+ block_extent.depth : subresource->layerCount;
+
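+ /* Save the compute pipeline, descriptor set 0 and push constants so the
+ * internal dispatches below do not clobber the application's state.
+ */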
+ struct anv_cmd_saved_state saved;
+ anv_cmd_buffer_save_state(cmd_buffer,
+ ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
+ ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0 |
+ ANV_CMD_SAVED_STATE_PUSH_CONSTANTS,
+ &saved);
+
+ for (uint32_t i = 0; i < slice_count; i++) {
+ struct anv_image_view src_view;
+ struct anv_image_view dst_view;
+ astc_emu_init_image_view(cmd_buffer, &src_view, image,
+ VK_FORMAT_R32G32B32A32_UINT,
+ VK_IMAGE_USAGE_SAMPLED_BIT,
+ subresource->mipLevel, slice_base + i);
+ astc_emu_init_image_view(cmd_buffer, &dst_view, image,
+ flush_denorms ? VK_FORMAT_R32G32B32A32_UINT
+ : VK_FORMAT_R8G8B8A8_UINT,
+ VK_IMAGE_USAGE_STORAGE_BIT,
+ subresource->mipLevel, slice_base + i);
+
+ if (flush_denorms) {
+ astc_emu_flush_denorm_slice(cmd_buffer, image->vk.format, layout,
+ anv_image_view_to_handle(&src_view),
+ anv_image_view_to_handle(&dst_view),
+ rect);
+ } else {
+ astc_emu_decompress_slice(cmd_buffer, image->vk.format, layout,
+ anv_image_view_to_handle(&src_view),
+ anv_image_view_to_handle(&dst_view),
+ rect);
+ }
+ }
+
+ anv_cmd_buffer_restore_state(cmd_buffer, &saved);
+}
+
+VkResult
+anv_device_init_astc_emu(struct anv_device *device)
+{
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+ VkResult result = VK_SUCCESS;
+
+ if (device->physical->flush_astc_ldr_void_extent_denorms)
+ simple_mtx_init(&astc_emu->mutex, mtx_plain);
+
+ if (device->physical->emu_astc_ldr) {
+ result = vk_texcompress_astc_init(&device->vk, &device->vk.alloc,
+ VK_NULL_HANDLE,
+ &astc_emu->texcompress);
+ }
+
+ return result;
+}
+
+void
+anv_device_finish_astc_emu(struct anv_device *device)
+{
+ struct anv_device_astc_emu *astc_emu = &device->astc_emu;
+
+ if (device->physical->flush_astc_ldr_void_extent_denorms) {
+ VkDevice _device = anv_device_to_handle(device);
+
+ anv_DestroyPipeline(_device, astc_emu->pipeline, NULL);
+ anv_DestroyPipelineLayout(_device, astc_emu->pipeline_layout, NULL);
+ anv_DestroyDescriptorSetLayout(_device, astc_emu->ds_layout, NULL);
+ simple_mtx_destroy(&astc_emu->mutex);
+ }
+
+ if (astc_emu->texcompress) {
+ vk_texcompress_astc_finish(&device->vk, &device->vk.alloc,
+ astc_emu->texcompress);
+ }
+}
diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index e7eda9bf9fa..bb986847a08 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -27,22 +27,24 @@
#include <unistd.h>
#include <fcntl.h>
+#include <xf86drm.h>
+
#include "anv_private.h"
#include "anv_measure.h"
-#include "genxml/gen8_pack.h"
+#include "genxml/gen9_pack.h"
#include "genxml/genX_bits.h"
-#include "perf/intel_perf.h"
-#include "util/debug.h"
+#include "util/perf/u_trace.h"
/** \file anv_batch_chain.c
*
* This file contains functions related to anv_cmd_buffer as a data
* structure. This involves everything required to create and destroy
- * the actual batch buffers as well as link them together and handle
- * relocations and surface state. It specifically does *not* contain any
- * handling of actual vkCmd calls beyond vkCmdExecuteCommands.
+ * the actual batch buffers as well as link them together.
+ *
+ * It specifically does *not* contain any handling of actual vkCmd calls
+ * beyond vkCmdExecuteCommands.
*/
/*-----------------------------------------------------------------------*
@@ -51,49 +53,25 @@
VkResult
anv_reloc_list_init(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc)
+ const VkAllocationCallbacks *alloc,
+ bool uses_relocs)
{
+ assert(alloc != NULL);
memset(list, 0, sizeof(*list));
+ list->uses_relocs = uses_relocs;
+ list->alloc = alloc;
return VK_SUCCESS;
}
static VkResult
anv_reloc_list_init_clone(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
const struct anv_reloc_list *other_list)
{
- list->num_relocs = other_list->num_relocs;
- list->array_length = other_list->array_length;
-
- if (list->num_relocs > 0) {
- list->relocs =
- vk_alloc(alloc, list->array_length * sizeof(*list->relocs), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (list->relocs == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- list->reloc_bos =
- vk_alloc(alloc, list->array_length * sizeof(*list->reloc_bos), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (list->reloc_bos == NULL) {
- vk_free(alloc, list->relocs);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
-
- memcpy(list->relocs, other_list->relocs,
- list->array_length * sizeof(*list->relocs));
- memcpy(list->reloc_bos, other_list->reloc_bos,
- list->array_length * sizeof(*list->reloc_bos));
- } else {
- list->relocs = NULL;
- list->reloc_bos = NULL;
- }
-
list->dep_words = other_list->dep_words;
if (list->dep_words > 0) {
list->deps =
- vk_alloc(alloc, list->dep_words * sizeof(BITSET_WORD), 8,
+ vk_alloc(list->alloc, list->dep_words * sizeof(BITSET_WORD), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
memcpy(list->deps, other_list->deps,
list->dep_words * sizeof(BITSET_WORD));
@@ -105,50 +83,13 @@ anv_reloc_list_init_clone(struct anv_reloc_list *list,
}
void
-anv_reloc_list_finish(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc)
+anv_reloc_list_finish(struct anv_reloc_list *list)
{
- vk_free(alloc, list->relocs);
- vk_free(alloc, list->reloc_bos);
- vk_free(alloc, list->deps);
-}
-
-static VkResult
-anv_reloc_list_grow(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- size_t num_additional_relocs)
-{
- if (list->num_relocs + num_additional_relocs <= list->array_length)
- return VK_SUCCESS;
-
- size_t new_length = MAX2(16, list->array_length * 2);
- while (new_length < list->num_relocs + num_additional_relocs)
- new_length *= 2;
-
- struct drm_i915_gem_relocation_entry *new_relocs =
- vk_realloc(alloc, list->relocs,
- new_length * sizeof(*list->relocs), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (new_relocs == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- list->relocs = new_relocs;
-
- struct anv_bo **new_reloc_bos =
- vk_realloc(alloc, list->reloc_bos,
- new_length * sizeof(*list->reloc_bos), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (new_reloc_bos == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- list->reloc_bos = new_reloc_bos;
-
- list->array_length = new_length;
-
- return VK_SUCCESS;
+ vk_free(list->alloc, list->deps);
}
static VkResult
anv_reloc_list_grow_deps(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
uint32_t min_num_words)
{
if (min_num_words <= list->dep_words)
@@ -159,10 +100,10 @@ anv_reloc_list_grow_deps(struct anv_reloc_list *list,
new_length *= 2;
BITSET_WORD *new_deps =
- vk_realloc(alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,
+ vk_realloc(list->alloc, list->deps, new_length * sizeof(BITSET_WORD), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (new_deps == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
list->deps = new_deps;
/* Zero out the new data */
@@ -173,18 +114,16 @@ anv_reloc_list_grow_deps(struct anv_reloc_list *list,
return VK_SUCCESS;
}
-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
VkResult
-anv_reloc_list_add_bo(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- struct anv_bo *target_bo)
+anv_reloc_list_add_bo_impl(struct anv_reloc_list *list,
+ struct anv_bo *target_bo)
{
- assert(!target_bo->is_wrapper);
- assert(target_bo->flags & EXEC_OBJECT_PINNED);
+ /* This can happen with sparse resources. */
+ if (!target_bo)
+ return VK_SUCCESS;
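+ /* Dependencies are tracked as a bitset indexed by GEM handle. */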
uint32_t idx = target_bo->gem_handle;
- VkResult result = anv_reloc_list_grow_deps(list, alloc,
+ VkResult result = anv_reloc_list_grow_deps(list,
(idx / BITSET_WORDBITS) + 1);
if (unlikely(result != VK_SUCCESS))
return result;
@@ -194,75 +133,18 @@ anv_reloc_list_add_bo(struct anv_reloc_list *list,
return VK_SUCCESS;
}
-VkResult
-anv_reloc_list_add(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- uint32_t offset, struct anv_bo *target_bo, uint32_t delta,
- uint64_t *address_u64_out)
-{
- struct drm_i915_gem_relocation_entry *entry;
- int index;
-
- struct anv_bo *unwrapped_target_bo = anv_bo_unwrap(target_bo);
- uint64_t target_bo_offset = READ_ONCE(unwrapped_target_bo->offset);
- if (address_u64_out)
- *address_u64_out = target_bo_offset + delta;
-
- assert(unwrapped_target_bo->gem_handle > 0);
- assert(unwrapped_target_bo->refcount > 0);
-
- if (unwrapped_target_bo->flags & EXEC_OBJECT_PINNED)
- return anv_reloc_list_add_bo(list, alloc, unwrapped_target_bo);
-
- VkResult result = anv_reloc_list_grow(list, alloc, 1);
- if (result != VK_SUCCESS)
- return result;
-
- /* XXX: Can we use I915_EXEC_HANDLE_LUT? */
- index = list->num_relocs++;
- list->reloc_bos[index] = target_bo;
- entry = &list->relocs[index];
- entry->target_handle = -1; /* See also anv_cmd_buffer_process_relocs() */
- entry->delta = delta;
- entry->offset = offset;
- entry->presumed_offset = target_bo_offset;
- entry->read_domains = 0;
- entry->write_domain = 0;
- VG(VALGRIND_CHECK_MEM_IS_DEFINED(entry, sizeof(*entry)));
-
- return VK_SUCCESS;
-}
-
static void
anv_reloc_list_clear(struct anv_reloc_list *list)
{
- list->num_relocs = 0;
if (list->dep_words > 0)
memset(list->deps, 0, list->dep_words * sizeof(BITSET_WORD));
}
-static VkResult
+VkResult
anv_reloc_list_append(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- struct anv_reloc_list *other, uint32_t offset)
+ struct anv_reloc_list *other)
{
- VkResult result = anv_reloc_list_grow(list, alloc, other->num_relocs);
- if (result != VK_SUCCESS)
- return result;
-
- if (other->num_relocs > 0) {
- memcpy(&list->relocs[list->num_relocs], &other->relocs[0],
- other->num_relocs * sizeof(other->relocs[0]));
- memcpy(&list->reloc_bos[list->num_relocs], &other->reloc_bos[0],
- other->num_relocs * sizeof(other->reloc_bos[0]));
-
- for (uint32_t i = 0; i < other->num_relocs; i++)
- list->relocs[i + list->num_relocs].offset += offset;
-
- list->num_relocs += other->num_relocs;
- }
-
- anv_reloc_list_grow_deps(list, alloc, other->dep_words);
+ anv_reloc_list_grow_deps(list, other->dep_words);
for (uint32_t w = 0; w < other->dep_words; w++)
list->deps[w] |= other->deps[w];
@@ -273,15 +155,23 @@ anv_reloc_list_append(struct anv_reloc_list *list,
* Functions related to anv_batch
*-----------------------------------------------------------------------*/
+static VkResult
+anv_extend_batch(struct anv_batch *batch, uint32_t size)
+{
+ assert(batch->extend_cb != NULL);
+ VkResult result = batch->extend_cb(batch, size, batch->user_data);
+ if (result != VK_SUCCESS)
+ return anv_batch_set_error(batch, result);
+ return result;
+}
+
void *
anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
{
- if (batch->next + num_dwords * 4 > batch->end) {
- VkResult result = batch->extend_cb(batch, batch->user_data);
- if (result != VK_SUCCESS) {
- anv_batch_set_error(batch, result);
+ uint32_t size = num_dwords * 4;
+ if (batch->next + size > batch->end) {
+ if (anv_extend_batch(batch, size) != VK_SUCCESS)
return NULL;
- }
}
void *p = batch->next;
@@ -292,10 +182,33 @@ anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords)
return p;
}
+/* Ensure enough contiguous space is available */
+VkResult
+anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size)
+{
+ if (batch->next + size > batch->end) {
+ VkResult result = anv_extend_batch(batch, size);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ assert(batch->next + size <= batch->end);
+
+ return VK_SUCCESS;
+}
+
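+/* Advance the batch pointer after the caller has written data directly into
+ * the batch. A typical usage sketch:
+ *
+ *    if (anv_batch_emit_ensure_space(batch, size) != VK_SUCCESS)
+ *       return;
+ *    memcpy(batch->next, data, size);
+ *    anv_batch_advance(batch, size);
+ */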
+void
+anv_batch_advance(struct anv_batch *batch, uint32_t size)
+{
+ assert(batch->next + size <= batch->end);
+
+ batch->next += size;
+}
+
struct anv_address
anv_batch_address(struct anv_batch *batch, void *batch_location)
{
- assert(batch->start < batch_location);
+ assert(batch->start <= batch_location);
/* Allow a jump at the current location of the batch. */
assert(batch->next >= batch_location);
@@ -306,17 +219,12 @@ anv_batch_address(struct anv_batch *batch, void *batch_location)
void
anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
{
- uint32_t size, offset;
-
- size = other->next - other->start;
+ uint32_t size = other->next - other->start;
assert(size % 4 == 0);
if (batch->next + size > batch->end) {
- VkResult result = batch->extend_cb(batch, batch->user_data);
- if (result != VK_SUCCESS) {
- anv_batch_set_error(batch, result);
+ if (anv_extend_batch(batch, size) != VK_SUCCESS)
return;
- }
}
assert(batch->next + size <= batch->end);
@@ -324,9 +232,7 @@ anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other)
VG(VALGRIND_CHECK_MEM_IS_DEFINED(other->start, size));
memcpy(batch->next, other->start, size);
- offset = batch->next - batch->start;
- VkResult result = anv_reloc_list_append(batch->relocs, batch->alloc,
- other->relocs, offset);
+ VkResult result = anv_reloc_list_append(batch->relocs, other->relocs);
if (result != VK_SUCCESS) {
anv_batch_set_error(batch, result);
return;
@@ -346,17 +252,18 @@ anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
{
VkResult result;
- struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
+ struct anv_batch_bo *bbo = vk_zalloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (bbo == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
size, &bbo->bo);
if (result != VK_SUCCESS)
goto fail_alloc;
- result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->pool->alloc);
+ const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
+ result = anv_reloc_list_init(&bbo->relocs, &cmd_buffer->vk.pool->alloc, uses_relocs);
if (result != VK_SUCCESS)
goto fail_bo_alloc;
@@ -367,7 +274,7 @@ anv_batch_bo_create(struct anv_cmd_buffer *cmd_buffer,
fail_bo_alloc:
anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
fail_alloc:
- vk_free(&cmd_buffer->pool->alloc, bbo);
+ vk_free(&cmd_buffer->vk.pool->alloc, bbo);
return result;
}
@@ -379,18 +286,17 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
{
VkResult result;
- struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->pool->alloc, sizeof(*bbo),
+ struct anv_batch_bo *bbo = vk_alloc(&cmd_buffer->vk.pool->alloc, sizeof(*bbo),
8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (bbo == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
other_bbo->bo->size, &bbo->bo);
if (result != VK_SUCCESS)
goto fail_alloc;
- result = anv_reloc_list_init_clone(&bbo->relocs, &cmd_buffer->pool->alloc,
- &other_bbo->relocs);
+ result = anv_reloc_list_init_clone(&bbo->relocs, &other_bbo->relocs);
if (result != VK_SUCCESS)
goto fail_bo_alloc;
@@ -403,7 +309,7 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer,
fail_bo_alloc:
anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
fail_alloc:
- vk_free(&cmd_buffer->pool->alloc, bbo);
+ vk_free(&cmd_buffer->vk.pool->alloc, bbo);
return result;
}
@@ -437,37 +343,6 @@ anv_batch_bo_finish(struct anv_batch_bo *bbo, struct anv_batch *batch)
VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->start, bbo->length));
}
-static VkResult
-anv_batch_bo_grow(struct anv_cmd_buffer *cmd_buffer, struct anv_batch_bo *bbo,
- struct anv_batch *batch, size_t aditional,
- size_t batch_padding)
-{
- assert(batch->start == bbo->bo->map);
- bbo->length = batch->next - batch->start;
-
- size_t new_size = bbo->bo->size;
- while (new_size <= bbo->length + aditional + batch_padding)
- new_size *= 2;
-
- if (new_size == bbo->bo->size)
- return VK_SUCCESS;
-
- struct anv_bo *new_bo;
- VkResult result = anv_bo_pool_alloc(&cmd_buffer->device->batch_bo_pool,
- new_size, &new_bo);
- if (result != VK_SUCCESS)
- return result;
-
- memcpy(new_bo->map, bbo->bo->map, bbo->length);
-
- anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
-
- bbo->bo = new_bo;
- anv_batch_bo_continue(bbo, batch, batch_padding);
-
- return VK_SUCCESS;
-}
-
static void
anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
struct anv_batch_bo *prev_bbo,
@@ -475,39 +350,30 @@ anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
uint32_t next_bbo_offset)
{
const uint32_t bb_start_offset =
- prev_bbo->length - GFX8_MI_BATCH_BUFFER_START_length * 4;
+ prev_bbo->length - GFX9_MI_BATCH_BUFFER_START_length * 4;
ASSERTED const uint32_t *bb_start = prev_bbo->bo->map + bb_start_offset;
/* Make sure we're looking at a MI_BATCH_BUFFER_START */
assert(((*bb_start >> 29) & 0x07) == 0);
assert(((*bb_start >> 23) & 0x3f) == 49);
- if (cmd_buffer->device->physical->use_softpin) {
- assert(prev_bbo->bo->flags & EXEC_OBJECT_PINNED);
- assert(next_bbo->bo->flags & EXEC_OBJECT_PINNED);
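+ /* The address of the next batch BO is known up front, so write it
+ * directly into the MI_BATCH_BUFFER_START rather than emitting a
+ * relocation.
+ */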
+ uint64_t *map = prev_bbo->bo->map + bb_start_offset + 4;
+ *map = intel_canonical_address(next_bbo->bo->offset + next_bbo_offset);
- write_reloc(cmd_buffer->device,
- prev_bbo->bo->map + bb_start_offset + 4,
- next_bbo->bo->offset + next_bbo_offset, true);
- } else {
- uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1;
- assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4);
-
- prev_bbo->relocs.reloc_bos[reloc_idx] = next_bbo->bo;
- prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset;
-
- /* Use a bogus presumed offset to force a relocation */
- prev_bbo->relocs.relocs[reloc_idx].presumed_offset = -1;
- }
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (cmd_buffer->device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(prev_bbo->bo->alloc_flags))
+ intel_flush_range(map, sizeof(uint64_t));
+#endif
}
static void
anv_batch_bo_destroy(struct anv_batch_bo *bbo,
struct anv_cmd_buffer *cmd_buffer)
{
- anv_reloc_list_finish(&bbo->relocs, &cmd_buffer->pool->alloc);
+ anv_reloc_list_finish(&bbo->relocs);
anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool, bbo->bo);
- vk_free(&cmd_buffer->pool->alloc, bbo);
+ vk_free(&cmd_buffer->vk.pool->alloc, bbo);
}
static VkResult
@@ -550,13 +416,36 @@ anv_batch_bo_list_clone(const struct list_head *list,
static struct anv_batch_bo *
anv_cmd_buffer_current_batch_bo(struct anv_cmd_buffer *cmd_buffer)
{
- return LIST_ENTRY(struct anv_batch_bo, cmd_buffer->batch_bos.prev, link);
+ return list_entry(cmd_buffer->batch_bos.prev, struct anv_batch_bo, link);
+}
+
+static struct anv_batch_bo *
+anv_cmd_buffer_current_generation_batch_bo(struct anv_cmd_buffer *cmd_buffer)
+{
+ return list_entry(cmd_buffer->generation.batch_bos.prev, struct anv_batch_bo, link);
}
struct anv_address
anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_state_pool *pool = anv_binding_table_pool(cmd_buffer->device);
+ /* Only graphics & compute queues need binding tables. */
+ if (!(cmd_buffer->queue_family->queueFlags & (VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT)))
+ return ANV_NULL_ADDRESS;
+
+ /* If we've never allocated a binding table block, do it now. Otherwise we
+ * would trigger another STATE_BASE_ADDRESS emission, which would require
+ * an additional set of flushes/stalls.
+ */
+ if (u_vector_length(&cmd_buffer->bt_block_states) == 0) {
+ VkResult result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return ANV_NULL_ADDRESS;
+ }
+ }
+
+ struct anv_state_pool *pool = &cmd_buffer->device->binding_table_pool;
struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
return (struct anv_address) {
.bo = pool->block_pool.bo,
@@ -565,60 +454,57 @@ anv_cmd_buffer_surface_base_address(struct anv_cmd_buffer *cmd_buffer)
}
static void
-emit_batch_buffer_start(struct anv_cmd_buffer *cmd_buffer,
+emit_batch_buffer_start(struct anv_batch *batch,
struct anv_bo *bo, uint32_t offset)
{
- /* In gfx8+ the address field grew to two dwords to accomodate 48 bit
- * offsets. The high 16 bits are in the last dword, so we can use the gfx8
- * version in either case, as long as we set the instruction length in the
- * header accordingly. This means that we always emit three dwords here
- * and all the padding and adjustment we do in this file works for all
- * gens.
- */
-
-#define GFX7_MI_BATCH_BUFFER_START_length 2
-#define GFX7_MI_BATCH_BUFFER_START_length_bias 2
-
- const uint32_t gfx7_length =
- GFX7_MI_BATCH_BUFFER_START_length - GFX7_MI_BATCH_BUFFER_START_length_bias;
- const uint32_t gfx8_length =
- GFX8_MI_BATCH_BUFFER_START_length - GFX8_MI_BATCH_BUFFER_START_length_bias;
-
- anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_START, bbs) {
- bbs.DWordLength = cmd_buffer->device->info.ver < 8 ?
- gfx7_length : gfx8_length;
+ anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
+ bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length -
+ GFX9_MI_BATCH_BUFFER_START_length_bias;
bbs.SecondLevelBatchBuffer = Firstlevelbatch;
bbs.AddressSpaceIndicator = ASI_PPGTT;
bbs.BatchBufferStartAddress = (struct anv_address) { bo, offset };
}
}
+enum anv_cmd_buffer_batch {
+ ANV_CMD_BUFFER_BATCH_MAIN,
+ ANV_CMD_BUFFER_BATCH_GENERATION,
+};
+
static void
cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
- struct anv_batch_bo *bbo)
+ struct anv_batch_bo *bbo,
+ enum anv_cmd_buffer_batch batch_type)
{
- struct anv_batch *batch = &cmd_buffer->batch;
+ struct anv_batch *batch =
+ batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
+ &cmd_buffer->generation.batch : &cmd_buffer->batch;
struct anv_batch_bo *current_bbo =
+ batch_type == ANV_CMD_BUFFER_BATCH_GENERATION ?
+ anv_cmd_buffer_current_generation_batch_bo(cmd_buffer) :
anv_cmd_buffer_current_batch_bo(cmd_buffer);
/* We set the end of the batch a little short so we would be sure we
* have room for the chaining command. Since we're about to emit the
* chaining command, let's set it back where it should go.
*/
- batch->end += GFX8_MI_BATCH_BUFFER_START_length * 4;
+ batch->end += GFX9_MI_BATCH_BUFFER_START_length * 4;
assert(batch->end == current_bbo->bo->map + current_bbo->bo->size);
- emit_batch_buffer_start(cmd_buffer, bbo->bo, 0);
+ emit_batch_buffer_start(batch, bbo->bo, 0);
anv_batch_bo_finish(current_bbo, batch);
+
+ /* Account for the amount of data written into current_bbo in the command
+ * buffer's total batch size.
+ */
+ cmd_buffer->total_batch_size += current_bbo->length;
}
static void
anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
struct anv_cmd_buffer *cmd_buffer_to)
{
- assert(cmd_buffer_from->device->physical->use_softpin);
-
uint32_t *bb_start = cmd_buffer_from->batch_end;
struct anv_batch_bo *last_bbo =
@@ -626,8 +512,8 @@ anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
struct anv_batch_bo *first_bbo =
list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
- struct GFX8_MI_BATCH_BUFFER_START gen_bb_start = {
- __anv_cmd_header(GFX8_MI_BATCH_BUFFER_START),
+ struct GFX9_MI_BATCH_BUFFER_START gen_bb_start = {
+ __anv_cmd_header(GFX9_MI_BATCH_BUFFER_START),
.SecondLevelBatchBuffer = Firstlevelbatch,
.AddressSpaceIndicator = ASI_PPGTT,
.BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 },
@@ -636,10 +522,10 @@ anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
.start = last_bbo->bo->map,
.end = last_bbo->bo->map + last_bbo->bo->size,
.relocs = &last_bbo->relocs,
- .alloc = &cmd_buffer_from->pool->alloc,
+ .alloc = &cmd_buffer_from->vk.pool->alloc,
};
- __anv_cmd_pack(GFX8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
+ __anv_cmd_pack(GFX9_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
last_bbo->chained = true;
}
@@ -647,56 +533,92 @@ anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
static void
anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
{
- assert(cmd_buffer->device->physical->use_softpin);
-
struct anv_batch_bo *last_bbo =
list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
last_bbo->chained = false;
uint32_t *batch = cmd_buffer->batch_end;
- anv_pack_struct(batch, GFX8_MI_BATCH_BUFFER_END,
- __anv_cmd_header(GFX8_MI_BATCH_BUFFER_END));
+ anv_pack_struct(batch, GFX9_MI_BATCH_BUFFER_END,
+ __anv_cmd_header(GFX9_MI_BATCH_BUFFER_END));
}
static VkResult
-anv_cmd_buffer_chain_batch(struct anv_batch *batch, void *_data)
+anv_cmd_buffer_chain_batch(struct anv_batch *batch, uint32_t size, void *_data)
{
+ /* The caller should not need that much space. Otherwise it should split
+ * its commands.
+ */
+ assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
+
struct anv_cmd_buffer *cmd_buffer = _data;
- struct anv_batch_bo *new_bbo;
+ struct anv_batch_bo *new_bbo = NULL;
+ /* Amount of reserved space at the end of the batch to account for the
+ * chaining instruction.
+ */
+ const uint32_t batch_padding = GFX9_MI_BATCH_BUFFER_START_length * 4;
/* Cap reallocation to chunk. */
- uint32_t alloc_size = MIN2(cmd_buffer->total_batch_size,
- ANV_MAX_CMD_BUFFER_BATCH_SIZE);
+ uint32_t alloc_size = MIN2(
+ MAX2(batch->allocated_batch_size, size + batch_padding),
+ ANV_MAX_CMD_BUFFER_BATCH_SIZE);
VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
if (result != VK_SUCCESS)
return result;
- cmd_buffer->total_batch_size += alloc_size;
+ batch->allocated_batch_size += alloc_size;
struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
if (seen_bbo == NULL) {
anv_batch_bo_destroy(new_bbo, cmd_buffer);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
}
*seen_bbo = new_bbo;
- cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo);
+ cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo, ANV_CMD_BUFFER_BATCH_MAIN);
list_addtail(&new_bbo->link, &cmd_buffer->batch_bos);
- anv_batch_bo_start(new_bbo, batch, GFX8_MI_BATCH_BUFFER_START_length * 4);
+ anv_batch_bo_start(new_bbo, batch, batch_padding);
return VK_SUCCESS;
}
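+/* Same as anv_cmd_buffer_chain_batch() but operating on the command buffer's
+ * generation batch, which is chained independently of the main batch.
+ */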
static VkResult
-anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data)
+anv_cmd_buffer_chain_generation_batch(struct anv_batch *batch, uint32_t size, void *_data)
{
+ /* The caller should not need that much space. Otherwise it should split
+ * its commands.
+ */
+ assert(size <= ANV_MAX_CMD_BUFFER_BATCH_SIZE);
+
struct anv_cmd_buffer *cmd_buffer = _data;
- struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
+ struct anv_batch_bo *new_bbo = NULL;
+ /* Cap reallocation to chunk. */
+ uint32_t alloc_size = MIN2(
+ MAX2(batch->allocated_batch_size, size),
+ ANV_MAX_CMD_BUFFER_BATCH_SIZE);
- anv_batch_bo_grow(cmd_buffer, bbo, &cmd_buffer->batch, 4096,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
+ VkResult result = anv_batch_bo_create(cmd_buffer, alloc_size, &new_bbo);
+ if (result != VK_SUCCESS)
+ return result;
+
+ batch->allocated_batch_size += alloc_size;
+
+ struct anv_batch_bo **seen_bbo = u_vector_add(&cmd_buffer->seen_bbos);
+ if (seen_bbo == NULL) {
+ anv_batch_bo_destroy(new_bbo, cmd_buffer);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+ *seen_bbo = new_bbo;
+
+ if (!list_is_empty(&cmd_buffer->generation.batch_bos)) {
+ cmd_buffer_chain_to_batch_bo(cmd_buffer, new_bbo,
+ ANV_CMD_BUFFER_BATCH_GENERATION);
+ }
+
+ list_addtail(&new_bbo->link, &cmd_buffer->generation.batch_bos);
+
+ anv_batch_bo_start(new_bbo, batch, GFX9_MI_BATCH_BUFFER_START_length * 4);
return VK_SUCCESS;
}
@@ -759,9 +681,6 @@ anv_cmd_buffer_grow_batch(struct anv_batch *batch, void *_data)
* surface state offsets so that they are correct relative to our new surface
* state base address at the bottom of the binding table block.
*
- * \see adjust_relocations_from_block_pool()
- * \see adjust_relocations_too_block_pool()
- *
* \param[in] entries The number of surface state entries the binding
* table should be able to hold.
*
@@ -776,9 +695,12 @@ struct anv_state
anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
uint32_t entries, uint32_t *state_offset)
{
+ if (u_vector_length(&cmd_buffer->bt_block_states) == 0)
+ return (struct anv_state) { 0 };
+
struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
- uint32_t bt_size = align_u32(entries * 4, 32);
+ uint32_t bt_size = align(entries * 4, 32);
struct anv_state state = cmd_buffer->bt_next;
if (bt_size > state.alloc_size)
@@ -789,26 +711,131 @@ anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->bt_next.map += bt_size;
cmd_buffer->bt_next.alloc_size -= bt_size;
- assert(bt_block->offset < 0);
- *state_offset = -bt_block->offset;
+ if (cmd_buffer->device->info->verx10 >= 125) {
+ /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to change the binding
+ * table address independently from surface state base address. We no
+ * longer need any sort of offsetting.
+ */
+ *state_offset = 0;
+ } else {
+ assert(bt_block->offset < 0);
+ *state_offset = -bt_block->offset;
+ }
return state;
}
struct anv_state
-anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer)
+anv_cmd_buffer_alloc_surface_states(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t count)
{
+ if (count == 0)
+ return ANV_STATE_NULL;
struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- return anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
- isl_dev->ss.size, isl_dev->ss.align);
+ struct anv_state state =
+ anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
+ count * isl_dev->ss.size,
+ isl_dev->ss.align);
+ if (state.map == NULL)
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return state;
}
struct anv_state
anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
uint32_t size, uint32_t alignment)
{
- return anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
- size, alignment);
+ if (size == 0)
+ return ANV_STATE_NULL;
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ struct anv_state state =
+ anv_state_stream_alloc(cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ &cmd_buffer->dynamic_state_db_stream :
+ &cmd_buffer->dynamic_state_stream,
+ size, alignment);
+ if (state.map == NULL)
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return state;
+}
+
+struct anv_state
+anv_cmd_buffer_alloc_general_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t size, uint32_t alignment)
+{
+ if (size == 0)
+ return ANV_STATE_NULL;
+ struct anv_state state =
+ anv_state_stream_alloc(&cmd_buffer->general_state_stream,
+ size, alignment);
+ if (state.map == NULL)
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return state;
+}
+
+/** Allocate space associated with a command buffer
+ *
+ * Some commands, like vkCmdBuildAccelerationStructuresKHR(), can end up
+ * needing large amounts of temporary memory. This function deals with those
+ * potentially larger allocations, using a side BO if needed.
+ */
+struct anv_cmd_alloc
+anv_cmd_buffer_alloc_space(struct anv_cmd_buffer *cmd_buffer,
+ size_t size, uint32_t alignment,
+ bool mapped)
+{
+ /* Below 16k, source memory from dynamic state, otherwise allocate a BO. */
+ if (size < 16 * 1024) {
+ struct anv_state state =
+ anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
+ size, alignment);
+ if (state.map == NULL) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return (struct anv_cmd_alloc) {
+ .address = ANV_NULL_ADDRESS,
+ };
+ }
+
+ return (struct anv_cmd_alloc) {
+ .address = anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool,
+ state),
+ .map = state.map,
+ .size = size,
+ };
+ }
+
+ assert(alignment <= 4096);
+
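+ /* Allocations that need a CPU mapping come from the batch BO pool,
+ * otherwise from the BVH BO pool.
+ */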
+ struct anv_bo *bo = NULL;
+ VkResult result =
+ anv_bo_pool_alloc(mapped ?
+ &cmd_buffer->device->batch_bo_pool :
+ &cmd_buffer->device->bvh_bo_pool,
+ align(size, 4096), &bo);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return ANV_EMPTY_ALLOC;
+ }
+
+ struct anv_bo **bo_entry =
+ u_vector_add(&cmd_buffer->dynamic_bos);
+ if (bo_entry == NULL) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
+ anv_bo_pool_free(bo->map != NULL ?
+ &cmd_buffer->device->batch_bo_pool :
+ &cmd_buffer->device->bvh_bo_pool, bo);
+ return ANV_EMPTY_ALLOC;
+ }
+ *bo_entry = bo;
+
+ return (struct anv_cmd_alloc) {
+ .address = (struct anv_address) { .bo = bo },
+ .map = bo->map,
+ .size = size,
+ };
}
VkResult
@@ -817,7 +844,7 @@ anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
if (bt_block == NULL) {
anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
}
*bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);
@@ -834,55 +861,58 @@ anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
VkResult
anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_batch_bo *batch_bo;
+ struct anv_batch_bo *batch_bo = NULL;
VkResult result;
list_inithead(&cmd_buffer->batch_bos);
- cmd_buffer->total_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;
+ cmd_buffer->total_batch_size = 0;
result = anv_batch_bo_create(cmd_buffer,
- cmd_buffer->total_batch_size,
+ ANV_MIN_CMD_BUFFER_BATCH_SIZE,
&batch_bo);
if (result != VK_SUCCESS)
return result;
list_addtail(&batch_bo->link, &cmd_buffer->batch_bos);
- cmd_buffer->batch.alloc = &cmd_buffer->pool->alloc;
+ cmd_buffer->batch.alloc = &cmd_buffer->vk.pool->alloc;
cmd_buffer->batch.user_data = cmd_buffer;
+ cmd_buffer->batch.allocated_batch_size = ANV_MIN_CMD_BUFFER_BATCH_SIZE;
- if (cmd_buffer->device->can_chain_batches) {
- cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
- } else {
- cmd_buffer->batch.extend_cb = anv_cmd_buffer_grow_batch;
- }
+ cmd_buffer->batch.extend_cb = anv_cmd_buffer_chain_batch;
+ cmd_buffer->batch.engine_class = cmd_buffer->queue_family->engine_class;
anv_batch_bo_start(batch_bo, &cmd_buffer->batch,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
+ GFX9_MI_BATCH_BUFFER_START_length * 4);
- int success = u_vector_init(&cmd_buffer->seen_bbos,
- sizeof(struct anv_bo *),
- 8 * sizeof(struct anv_bo *));
+ /* Generation batch is initialized empty since it's possible it won't be
+ * used.
+ */
+ list_inithead(&cmd_buffer->generation.batch_bos);
+
+ cmd_buffer->generation.batch.alloc = &cmd_buffer->vk.pool->alloc;
+ cmd_buffer->generation.batch.user_data = cmd_buffer;
+ cmd_buffer->generation.batch.allocated_batch_size = 0;
+ cmd_buffer->generation.batch.extend_cb = anv_cmd_buffer_chain_generation_batch;
+ cmd_buffer->generation.batch.engine_class =
+ cmd_buffer->queue_family->engine_class;
+
+ int success = u_vector_init_pow2(&cmd_buffer->seen_bbos, 8,
+ sizeof(struct anv_bo *));
if (!success)
goto fail_batch_bo;
*(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = batch_bo;
- /* u_vector requires power-of-two size elements */
- unsigned pow2_state_size = util_next_power_of_two(sizeof(struct anv_state));
- success = u_vector_init(&cmd_buffer->bt_block_states,
- pow2_state_size, 8 * pow2_state_size);
+ success = u_vector_init(&cmd_buffer->bt_block_states, 8,
+ sizeof(struct anv_state));
if (!success)
goto fail_seen_bbos;
+ const bool uses_relocs = cmd_buffer->device->physical->uses_relocs;
result = anv_reloc_list_init(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc);
- if (result != VK_SUCCESS)
- goto fail_bt_blocks;
- cmd_buffer->last_ss_pool_center = 0;
-
- result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
+ &cmd_buffer->vk.pool->alloc, uses_relocs);
if (result != VK_SUCCESS)
goto fail_bt_blocks;
@@ -906,7 +936,7 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
u_vector_finish(&cmd_buffer->bt_block_states);
- anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);
+ anv_reloc_list_finish(&cmd_buffer->surface_relocs);
u_vector_finish(&cmd_buffer->seen_bbos);
@@ -916,6 +946,17 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
list_del(&bbo->link);
anv_batch_bo_destroy(bbo, cmd_buffer);
}
+ /* Also destroy all generation batch buffers */
+ list_for_each_entry_safe(struct anv_batch_bo, bbo,
+ &cmd_buffer->generation.batch_bos, link) {
+ list_del(&bbo->link);
+ anv_batch_bo_destroy(bbo, cmd_buffer);
+ }
+
+ if (cmd_buffer->generation.ring_bo) {
+ anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
+ cmd_buffer->generation.ring_bo);
+ }
}
void
@@ -932,18 +973,15 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
anv_batch_bo_start(anv_cmd_buffer_current_batch_bo(cmd_buffer),
&cmd_buffer->batch,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
+ GFX9_MI_BATCH_BUFFER_START_length * 4);
- while (u_vector_length(&cmd_buffer->bt_block_states) > 1) {
+ while (u_vector_length(&cmd_buffer->bt_block_states) > 0) {
struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
}
- assert(u_vector_length(&cmd_buffer->bt_block_states) == 1);
- cmd_buffer->bt_next = *(struct anv_state *)u_vector_head(&cmd_buffer->bt_block_states);
- cmd_buffer->bt_next.offset = 0;
+ cmd_buffer->bt_next = ANV_STATE_NULL;
anv_reloc_list_clear(&cmd_buffer->surface_relocs);
- cmd_buffer->last_ss_pool_center = 0;
/* Reset the list of seen buffers */
cmd_buffer->seen_bbos.head = 0;
@@ -953,25 +991,45 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer)
*(struct anv_batch_bo **)u_vector_add(&cmd_buffer->seen_bbos) = first_bbo;
+ assert(first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);
+ cmd_buffer->batch.allocated_batch_size = first_bbo->bo->size;
+
+ /* Delete all generation batch bos */
+ list_for_each_entry_safe(struct anv_batch_bo, bbo,
+ &cmd_buffer->generation.batch_bos, link) {
+ list_del(&bbo->link);
+ anv_batch_bo_destroy(bbo, cmd_buffer);
+ }
+
+ /* And reset generation batch */
+ cmd_buffer->generation.batch.allocated_batch_size = 0;
+ cmd_buffer->generation.batch.start = NULL;
+ cmd_buffer->generation.batch.end = NULL;
+ cmd_buffer->generation.batch.next = NULL;
- assert(!cmd_buffer->device->can_chain_batches ||
- first_bbo->bo->size == ANV_MIN_CMD_BUFFER_BATCH_SIZE);
- cmd_buffer->total_batch_size = first_bbo->bo->size;
+ if (cmd_buffer->generation.ring_bo) {
+ anv_bo_pool_free(&cmd_buffer->device->batch_bo_pool,
+ cmd_buffer->generation.ring_bo);
+ cmd_buffer->generation.ring_bo = NULL;
+ }
+
+ cmd_buffer->total_batch_size = 0;
}
void
anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
struct anv_batch_bo *batch_bo = anv_cmd_buffer_current_batch_bo(cmd_buffer);
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
/* When we start a batch buffer, we subtract a certain amount of
* padding from the end to ensure that we always have room to emit a
* BATCH_BUFFER_START to chain to the next BO. We need to remove
* that padding before we end the batch; otherwise, we may end up
* with our BATCH_BUFFER_END in another BO.
*/
- cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4;
+ cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
assert(cmd_buffer->batch.start == batch_bo->bo->map);
assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
@@ -983,50 +1041,29 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
*/
batch_bo->chained = anv_cmd_buffer_is_chainable(cmd_buffer);
if (batch_bo->chained)
- emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0);
+ emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
else
- anv_batch_emit(&cmd_buffer->batch, GFX8_MI_BATCH_BUFFER_END, bbe);
+ anv_batch_emit(&cmd_buffer->batch, GFX9_MI_BATCH_BUFFER_END, bbe);
/* Round batch up to an even number of dwords. */
if ((cmd_buffer->batch.next - cmd_buffer->batch.start) & 4)
- anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
+ anv_batch_emit(&cmd_buffer->batch, GFX9_MI_NOOP, noop);
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
} else {
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
/* If this is a secondary command buffer, we need to determine the
* mode in which it will be executed with vkExecuteCommands. We
* determine this statically here so that this stays in sync with the
* actual ExecuteCommands implementation.
*/
const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start;
- if (!cmd_buffer->device->can_chain_batches) {
- cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT;
- } else if (cmd_buffer->device->physical->use_call_secondary) {
+ if (cmd_buffer->device->physical->use_call_secondary) {
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN;
- /* If the secondary command buffer begins & ends in the same BO and
- * its length is less than the length of CS prefetch, add some NOOPs
- * instructions so the last MI_BATCH_BUFFER_START is outside the CS
- * prefetch.
- */
- if (cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) {
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
- /* Careful to have everything in signed integer. */
- int32_t prefetch_len = devinfo->cs_prefetch_size;
- int32_t batch_len =
- cmd_buffer->batch.next - cmd_buffer->batch.start;
-
- for (int32_t i = 0; i < (prefetch_len - batch_len); i += 4)
- anv_batch_emit(&cmd_buffer->batch, GFX8_MI_NOOP, noop);
- }
void *jump_addr =
- anv_batch_emitn(&cmd_buffer->batch,
- GFX8_MI_BATCH_BUFFER_START_length,
- GFX8_MI_BATCH_BUFFER_START,
- .AddressSpaceIndicator = ASI_PPGTT,
- .SecondLevelBatchBuffer = Firstlevelbatch) +
- (GFX8_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
+ anv_genX(devinfo, batch_emit_return)(&cmd_buffer->batch) +
+ (GFX9_MI_BATCH_BUFFER_START_BatchBufferStartAddress_start / 8);
cmd_buffer->return_addr = anv_batch_address(&cmd_buffer->batch, jump_addr);
/* The emit above may have caused us to chain batch buffers which
@@ -1054,11 +1091,11 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
* have room for the chaining command. Since we're about to emit the
* chaining command, let's set it back where it should go.
*/
- cmd_buffer->batch.end += GFX8_MI_BATCH_BUFFER_START_length * 4;
+ cmd_buffer->batch.end += GFX9_MI_BATCH_BUFFER_START_length * 4;
assert(cmd_buffer->batch.start == batch_bo->bo->map);
assert(cmd_buffer->batch.end == batch_bo->bo->map + batch_bo->bo->size);
- emit_batch_buffer_start(cmd_buffer, batch_bo->bo, 0);
+ emit_batch_buffer_start(&cmd_buffer->batch, batch_bo->bo, 0);
assert(cmd_buffer->batch.start == batch_bo->bo->map);
} else {
cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
@@ -1066,6 +1103,11 @@ anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer)
}
anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
+
+ /* Account for the amount of data written into batch_bo in the command
+ * buffer's total batch size.
+ */
+ cmd_buffer->total_batch_size += batch_bo->length;
}
static VkResult
@@ -1075,7 +1117,7 @@ anv_cmd_buffer_add_seen_bbos(struct anv_cmd_buffer *cmd_buffer,
list_for_each_entry(struct anv_batch_bo, bbo, list, link) {
struct anv_batch_bo **bbo_ptr = u_vector_add(&cmd_buffer->seen_bbos);
if (bbo_ptr == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(cmd_buffer, VK_ERROR_OUT_OF_HOST_MEMORY);
*bbo_ptr = bbo;
}
@@ -1092,21 +1134,13 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
case ANV_CMD_BUFFER_EXEC_MODE_EMIT:
anv_batch_emit_batch(&primary->batch, &secondary->batch);
break;
- case ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT: {
- struct anv_batch_bo *bbo = anv_cmd_buffer_current_batch_bo(primary);
- unsigned length = secondary->batch.end - secondary->batch.start;
- anv_batch_bo_grow(primary, bbo, &primary->batch, length,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
- anv_batch_emit_batch(&primary->batch, &secondary->batch);
- break;
- }
case ANV_CMD_BUFFER_EXEC_MODE_CHAIN: {
struct anv_batch_bo *first_bbo =
list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
struct anv_batch_bo *last_bbo =
list_last_entry(&secondary->batch_bos, struct anv_batch_bo, link);
- emit_batch_buffer_start(primary, first_bbo->bo, 0);
+ emit_batch_buffer_start(&primary->batch, first_bbo->bo, 0);
struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
assert(primary->batch.start == this_bbo->bo->map);
@@ -1135,30 +1169,23 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
struct anv_batch_bo *last_bbo =
list_last_entry(&copy_list, struct anv_batch_bo, link);
- cmd_buffer_chain_to_batch_bo(primary, first_bbo);
+ cmd_buffer_chain_to_batch_bo(primary, first_bbo,
+ ANV_CMD_BUFFER_BATCH_MAIN);
list_splicetail(&copy_list, &primary->batch_bos);
anv_batch_bo_continue(last_bbo, &primary->batch,
- GFX8_MI_BATCH_BUFFER_START_length * 4);
+ GFX9_MI_BATCH_BUFFER_START_length * 4);
break;
}
case ANV_CMD_BUFFER_EXEC_MODE_CALL_AND_RETURN: {
struct anv_batch_bo *first_bbo =
list_first_entry(&secondary->batch_bos, struct anv_batch_bo, link);
- uint64_t *write_return_addr =
- anv_batch_emitn(&primary->batch,
- GFX8_MI_STORE_DATA_IMM_length + 1 /* QWord write */,
- GFX8_MI_STORE_DATA_IMM,
- .Address = secondary->return_addr)
- + (GFX8_MI_STORE_DATA_IMM_ImmediateData_start / 8);
-
- emit_batch_buffer_start(primary, first_bbo->bo, 0);
-
- *write_return_addr =
- anv_address_physical(anv_batch_address(&primary->batch,
- primary->batch.next));
+ anv_genX(primary->device->info, batch_emit_secondary_call)(
+ &primary->batch,
+ (struct anv_address) { .bo = first_bbo->bo },
+ secondary->return_addr);
anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
break;
@@ -1167,904 +1194,524 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary,
assert(!"Invalid execution mode");
}
- anv_reloc_list_append(&primary->surface_relocs, &primary->pool->alloc,
- &secondary->surface_relocs, 0);
-}
-
-struct anv_execbuf {
- struct drm_i915_gem_execbuffer2 execbuf;
-
- struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences;
-
- struct drm_i915_gem_exec_object2 * objects;
- uint32_t bo_count;
- struct anv_bo ** bos;
-
- /* Allocated length of the 'objects' and 'bos' arrays */
- uint32_t array_length;
+ anv_reloc_list_append(&primary->surface_relocs, &secondary->surface_relocs);
- /* List of relocations for surface states, only used with platforms not
- * using softpin.
+ /* Add the amount of data written into the secondary command buffer to the
+ * primary command buffer's total.
*/
- void * surface_states_relocs;
-
- /* Indicates whether any of the command buffers have relocations. This
- * doesn't not necessarily mean we'll need the kernel to process them. It
- * might be that a previous execbuf has already placed things in the VMA
- * and we can make i915 skip the relocations.
- */
- bool has_relocs;
-
- const VkAllocationCallbacks * alloc;
- VkSystemAllocationScope alloc_scope;
-
- int perf_query_pass;
-};
-
-static void
-anv_execbuf_init(struct anv_execbuf *exec)
-{
- memset(exec, 0, sizeof(*exec));
-}
-
-static void
-anv_execbuf_finish(struct anv_execbuf *exec)
-{
- vk_free(exec->alloc, exec->surface_states_relocs);
- vk_free(exec->alloc, exec->objects);
- vk_free(exec->alloc, exec->bos);
+ primary->total_batch_size += secondary->total_batch_size;
}
-static void
-anv_execbuf_add_ext(struct anv_execbuf *exec,
- uint32_t ext_name,
- struct i915_user_extension *ext)
+void
+anv_cmd_buffer_chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers)
{
- __u64 *iter = &exec->execbuf.cliprects_ptr;
-
- exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS;
-
- while (*iter != 0) {
- iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension;
+ if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
+ assert(num_cmd_buffers == 1);
+ return;
}
- ext->name = ext_name;
+ /* Chain the N-1 first batch buffers */
+ for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++) {
+ assert(cmd_buffers[i]->companion_rcs_cmd_buffer == NULL);
+ anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
+ }
- *iter = (uintptr_t) ext;
+ /* Put an end to the last one */
+ anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
}
-static VkResult
-anv_execbuf_add_bo_bitset(struct anv_device *device,
- struct anv_execbuf *exec,
- uint32_t dep_words,
- BITSET_WORD *deps,
- uint32_t extra_flags);
-
-static VkResult
-anv_execbuf_add_bo(struct anv_device *device,
- struct anv_execbuf *exec,
- struct anv_bo *bo,
- struct anv_reloc_list *relocs,
- uint32_t extra_flags)
+static void
+anv_print_batch(struct anv_device *device,
+ struct anv_queue *queue,
+ struct anv_cmd_buffer *cmd_buffer)
{
- struct drm_i915_gem_exec_object2 *obj = NULL;
-
- bo = anv_bo_unwrap(bo);
-
- if (bo->index < exec->bo_count && exec->bos[bo->index] == bo)
- obj = &exec->objects[bo->index];
-
- if (obj == NULL) {
- /* We've never seen this one before. Add it to the list and assign
- * an id that we can use later.
- */
- if (exec->bo_count >= exec->array_length) {
- uint32_t new_len = exec->objects ? exec->array_length * 2 : 64;
-
- struct drm_i915_gem_exec_object2 *new_objects =
- vk_alloc(exec->alloc, new_len * sizeof(*new_objects), 8, exec->alloc_scope);
- if (new_objects == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- struct anv_bo **new_bos =
- vk_alloc(exec->alloc, new_len * sizeof(*new_bos), 8, exec->alloc_scope);
- if (new_bos == NULL) {
- vk_free(exec->alloc, new_objects);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
-
- if (exec->objects) {
- memcpy(new_objects, exec->objects,
- exec->bo_count * sizeof(*new_objects));
- memcpy(new_bos, exec->bos,
- exec->bo_count * sizeof(*new_bos));
- }
-
- vk_free(exec->alloc, exec->objects);
- vk_free(exec->alloc, exec->bos);
-
- exec->objects = new_objects;
- exec->bos = new_bos;
- exec->array_length = new_len;
- }
-
- assert(exec->bo_count < exec->array_length);
-
- bo->index = exec->bo_count++;
- obj = &exec->objects[bo->index];
- exec->bos[bo->index] = bo;
-
- obj->handle = bo->gem_handle;
- obj->relocation_count = 0;
- obj->relocs_ptr = 0;
- obj->alignment = 0;
- obj->offset = bo->offset;
- obj->flags = bo->flags | extra_flags;
- obj->rsvd1 = 0;
- obj->rsvd2 = 0;
+ struct anv_batch_bo *bbo =
+ list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
+ device->cmd_buffer_being_decoded = cmd_buffer;
+ struct intel_batch_decode_ctx *ctx = queue->decoder;
+
+ if (cmd_buffer->is_companion_rcs_cmd_buffer) {
+ int render_queue_idx =
+ anv_get_first_render_queue_index(device->physical);
+ ctx = &device->decoder[render_queue_idx];
}
- if (extra_flags & EXEC_OBJECT_WRITE) {
- obj->flags |= EXEC_OBJECT_WRITE;
- obj->flags &= ~EXEC_OBJECT_ASYNC;
+ if (INTEL_DEBUG(DEBUG_BATCH)) {
+ intel_print_batch(ctx, bbo->bo->map,
+ bbo->bo->size, bbo->bo->offset, false);
}
-
- if (relocs != NULL) {
- assert(obj->relocation_count == 0);
-
- if (relocs->num_relocs > 0) {
- /* This is the first time we've ever seen a list of relocations for
- * this BO. Go ahead and set the relocations and then walk the list
- * of relocations and add them all.
- */
- exec->has_relocs = true;
- obj->relocation_count = relocs->num_relocs;
- obj->relocs_ptr = (uintptr_t) relocs->relocs;
-
- for (size_t i = 0; i < relocs->num_relocs; i++) {
- VkResult result;
-
- /* A quick sanity check on relocations */
- assert(relocs->relocs[i].offset < bo->size);
- result = anv_execbuf_add_bo(device, exec, relocs->reloc_bos[i],
- NULL, extra_flags);
- if (result != VK_SUCCESS)
- return result;
- }
- }
-
- return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words,
- relocs->deps, extra_flags);
+ if (INTEL_DEBUG(DEBUG_BATCH_STATS)) {
+ intel_batch_stats(ctx, bbo->bo->map,
+ bbo->bo->size, bbo->bo->offset, false);
}
-
- return VK_SUCCESS;
+ device->cmd_buffer_being_decoded = NULL;
}
-/* Add BO dependencies to execbuf */
-static VkResult
-anv_execbuf_add_bo_bitset(struct anv_device *device,
- struct anv_execbuf *exec,
- uint32_t dep_words,
- BITSET_WORD *deps,
- uint32_t extra_flags)
+void
+anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass)
{
- for (uint32_t w = 0; w < dep_words; w++) {
- BITSET_WORD mask = deps[w];
- while (mask) {
- int i = u_bit_scan(&mask);
- uint32_t gem_handle = w * BITSET_WORDBITS + i;
- struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
- assert(bo->refcount > 0);
- VkResult result =
- anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags);
- if (result != VK_SUCCESS)
- return result;
- }
- }
-
- return VK_SUCCESS;
-}
+ if (!INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS))
+ return;
-static void
-anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
- struct anv_reloc_list *list)
-{
- for (size_t i = 0; i < list->num_relocs; i++)
- list->relocs[i].target_handle = anv_bo_unwrap(list->reloc_bos[i])->index;
-}
+ struct anv_device *device = queue->device;
+ const bool has_perf_query = perf_query_pool && perf_query_pass >= 0 &&
+ cmd_buffer_count;
+ uint64_t frame_id = device->debug_frame_desc->frame_id;
-static void
-adjust_relocations_from_state_pool(struct anv_state_pool *pool,
- struct anv_reloc_list *relocs,
- uint32_t last_pool_center_bo_offset)
-{
- assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
- uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;
-
- for (size_t i = 0; i < relocs->num_relocs; i++) {
- /* All of the relocations from this block pool to other BO's should
- * have been emitted relative to the surface block pool center. We
- * need to add the center offset to make them relative to the
- * beginning of the actual GEM bo.
- */
- relocs->relocs[i].offset += delta;
- }
-}
+ if (!intel_debug_batch_in_range(device->debug_frame_desc->frame_id))
+ return;
+ fprintf(stderr, "Batch for frame %"PRIu64" on queue %d\n",
+ frame_id, (int)(queue - device->queues));
-static void
-adjust_relocations_to_state_pool(struct anv_state_pool *pool,
- struct anv_bo *from_bo,
- struct anv_reloc_list *relocs,
- uint32_t last_pool_center_bo_offset)
-{
- assert(!from_bo->is_wrapper);
- assert(last_pool_center_bo_offset <= pool->block_pool.center_bo_offset);
- uint32_t delta = pool->block_pool.center_bo_offset - last_pool_center_bo_offset;
-
- /* When we initially emit relocations into a block pool, we don't
- * actually know what the final center_bo_offset will be so we just emit
- * it as if center_bo_offset == 0. Now that we know what the center
- * offset is, we need to walk the list of relocations and adjust any
- * relocations that point to the pool bo with the correct offset.
- */
- for (size_t i = 0; i < relocs->num_relocs; i++) {
- if (relocs->reloc_bos[i] == pool->block_pool.bo) {
- /* Adjust the delta value in the relocation to correctly
- * correspond to the new delta. Initially, this value may have
- * been negative (if treated as unsigned), but we trust in
- * uint32_t roll-over to fix that for us at this point.
- */
- relocs->relocs[i].delta += delta;
+ if (cmd_buffer_count) {
+ if (has_perf_query) {
+ struct anv_bo *pass_batch_bo = perf_query_pool->bo;
+ uint64_t pass_batch_offset =
+ khr_perf_query_preamble_offset(perf_query_pool, perf_query_pass);
- /* Since the delta has changed, we need to update the actual
- * relocated value with the new presumed value. This function
- * should only be called on batch buffers, so we know it isn't in
- * use by the GPU at the moment.
- */
- assert(relocs->relocs[i].offset < from_bo->size);
- write_reloc(pool->block_pool.device,
- from_bo->map + relocs->relocs[i].offset,
- relocs->relocs[i].presumed_offset +
- relocs->relocs[i].delta, false);
+ if (INTEL_DEBUG(DEBUG_BATCH)) {
+ intel_print_batch(queue->decoder,
+ pass_batch_bo->map + pass_batch_offset, 64,
+ pass_batch_bo->offset + pass_batch_offset, false);
+ }
}
- }
-}
-
-static void
-anv_reloc_list_apply(struct anv_device *device,
- struct anv_reloc_list *list,
- struct anv_bo *bo,
- bool always_relocate)
-{
- bo = anv_bo_unwrap(bo);
-
- for (size_t i = 0; i < list->num_relocs; i++) {
- struct anv_bo *target_bo = anv_bo_unwrap(list->reloc_bos[i]);
- if (list->relocs[i].presumed_offset == target_bo->offset &&
- !always_relocate)
- continue;
- void *p = bo->map + list->relocs[i].offset;
- write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true);
- list->relocs[i].presumed_offset = target_bo->offset;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++)
+ anv_print_batch(device, queue, cmd_buffers[i]);
+ } else if (INTEL_DEBUG(DEBUG_BATCH)) {
+ intel_print_batch(queue->decoder, device->trivial_batch_bo->map,
+ device->trivial_batch_bo->size,
+ device->trivial_batch_bo->offset, false);
}
}
-/**
- * This function applies the relocation for a command buffer and writes the
- * actual addresses into the buffers as per what we were told by the kernel on
- * the previous execbuf2 call. This should be safe to do because, for each
- * relocated address, we have two cases:
- *
- * 1) The target BO is inactive (as seen by the kernel). In this case, it is
- * not in use by the GPU so updating the address is 100% ok. It won't be
- * in-use by the GPU (from our context) again until the next execbuf2
- * happens. If the kernel decides to move it in the next execbuf2, it
- * will have to do the relocations itself, but that's ok because it should
- * have all of the information needed to do so.
+/* We lock around execbuf for two main reasons:
*
- * 2) The target BO is active (as seen by the kernel). In this case, it
- * hasn't moved since the last execbuffer2 call because GTT shuffling
- * *only* happens when the BO is idle. (From our perspective, it only
- * happens inside the execbuffer2 ioctl, but the shuffling may be
- * triggered by another ioctl, with full-ppgtt this is limited to only
- * execbuffer2 ioctls on the same context, or memory pressure.) Since the
- * target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT
- * address and the relocated value we are writing into the BO will be the
- * same as the value that is already there.
+ * 1) When a block pool is resized, we create a new gem handle with a
+ * different size and, in the case of surface states, possibly a different
+ * center offset but we re-use the same anv_bo struct when we do so. If
+ * this happens in the middle of setting up an execbuf, we could end up
+ * with our list of BOs out of sync with our list of gem handles.
*
- * There is also a possibility that the target BO is active but the exact
- * RENDER_SURFACE_STATE object we are writing the relocation into isn't in
- * use. In this case, the address currently in the RENDER_SURFACE_STATE
- * may be stale but it's still safe to write the relocation because that
- * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and
- * won't be until the next execbuf2 call.
+ * 2) The algorithm we use for building the list of unique buffers isn't
+ * thread-safe. While the client is supposed to synchronize around
+ * QueueSubmit, this would be extremely difficult to debug if it ever came
+ * up in the wild due to a broken app. It's better to play it safe and
+ * just lock around QueueSubmit.
*
- * By doing relocations on the CPU, we can tell the kernel that it doesn't
- * need to bother. We want to do this because the surface state buffer is
- * used by every command buffer so, if the kernel does the relocations, it
- * will always be busy and the kernel will always stall. This is also
- * probably the fastest mechanism for doing relocations since the kernel would
- * have to make a full copy of all the relocations lists.
+ * Since the only other things that ever take the device lock, such as block
+ * pool resizes, happen only rarely, the lock will almost never be contended,
+ * so taking it isn't really an expensive operation in this case.
*/
-static bool
-execbuf_can_skip_relocations(struct anv_execbuf *exec)
+static inline VkResult
+anv_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit)
{
- if (!exec->has_relocs)
- return true;
-
- static int userspace_relocs = -1;
- if (userspace_relocs < 0)
- userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
- if (!userspace_relocs)
- return false;
-
- /* First, we have to check to see whether or not we can even do the
- * relocation. New buffers which have never been submitted to the kernel
- * don't have a valid offset so we need to let the kernel do relocations so
- * that we can get offsets for them. On future execbuf2 calls, those
- * buffers will have offsets and we will be able to skip relocating.
- * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1.
- */
- for (uint32_t i = 0; i < exec->bo_count; i++) {
- assert(!exec->bos[i]->is_wrapper);
- if (exec->bos[i]->offset == (uint64_t)-1)
- return false;
- }
-
- return true;
-}
+ struct anv_device *device = queue->device;
+ VkResult result = VK_SUCCESS;
-static void
-relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
- struct anv_execbuf *exec)
-{
- /* Since surface states are shared between command buffers and we don't
- * know what order they will be submitted to the kernel, we don't know
- * what address is actually written in the surface state object at any
- * given time. The only option is to always relocate them.
+ /* We only need to synchronize the main & companion command buffers if we
+ * have a companion command buffer somewhere in the list of command
+ * buffers.
*/
- struct anv_bo *surface_state_bo =
- anv_bo_unwrap(cmd_buffer->device->surface_state_pool.block_pool.bo);
- anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs,
- surface_state_bo,
- true /* always relocate surface states */);
-
- /* Since we own all of the batch buffers, we know what values are stored
- * in the relocated addresses and only have to update them if the offsets
- * have changed.
- */
- struct anv_batch_bo **bbo;
- u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
- anv_reloc_list_apply(cmd_buffer->device,
- &(*bbo)->relocs, (*bbo)->bo, false);
+ bool needs_companion_sync = false;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ if (cmd_buffers[i]->companion_rcs_cmd_buffer != NULL) {
+ needs_companion_sync = true;
+ break;
+ }
}
- for (uint32_t i = 0; i < exec->bo_count; i++)
- exec->objects[i].offset = exec->bos[i]->offset;
-}
-
-static void
-reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer)
-{
- /* In the case where we fall back to doing kernel relocations, we need to
- * ensure that the relocation list is valid. All relocations on the batch
- * buffers are already valid and kept up-to-date. Since surface states are
- * shared between command buffers and we don't know what order they will be
- * submitted to the kernel, we don't know what address is actually written
- * in the surface state object at any given time. The only option is to set
- * a bogus presumed offset and let the kernel relocate them.
- */
- for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
- cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
-}
-
-static VkResult
-setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
- struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_state_pool *ss_pool =
- &cmd_buffer->device->surface_state_pool;
+ result =
+ device->kmd_backend->queue_exec_locked(
+ queue,
+ wait_count, waits,
+ cmd_buffer_count, cmd_buffers,
+ needs_companion_sync ? 0 : signal_count, signals,
+ perf_query_pool,
+ perf_query_pass,
+ utrace_submit);
+ if (result != VK_SUCCESS)
+ return result;
- adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs,
- cmd_buffer->last_ss_pool_center);
- VkResult result;
- if (cmd_buffer->device->physical->use_softpin) {
- /* Add surface dependencies (BOs) to the execbuf */
- anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
- cmd_buffer->surface_relocs.dep_words,
- cmd_buffer->surface_relocs.deps, 0);
- } else {
- /* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs
- * will get added automatically by processing relocations on the batch
- * buffer. We have to add the surface state BO manually because it has
- * relocations of its own that we need to be sure are processsed.
+ if (needs_companion_sync) {
+ struct vk_sync_wait companion_sync = {
+ .sync = queue->companion_sync,
+ };
+ /* If any of the command buffers had a companion batch, the submission
+ * backend will signal queue->companion_sync, so to ensure completion,
+ * we just need to wait on that fence.
*/
- result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
- ss_pool->block_pool.bo,
- &cmd_buffer->surface_relocs, 0);
- if (result != VK_SUCCESS)
- return result;
- }
-
- /* First, we walk over all of the bos we've seen and add them and their
- * relocations to the validate list.
- */
- struct anv_batch_bo **bbo;
- u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
- adjust_relocations_to_state_pool(ss_pool, (*bbo)->bo, &(*bbo)->relocs,
- cmd_buffer->last_ss_pool_center);
-
- result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
- (*bbo)->bo, &(*bbo)->relocs, 0);
- if (result != VK_SUCCESS)
- return result;
+ result =
+ device->kmd_backend->queue_exec_locked(queue,
+ 1, &companion_sync,
+ 0, NULL,
+ signal_count, signals,
+ NULL, 0,
+ NULL);
}
- /* Now that we've adjusted all of the surface state relocations, we need to
- * record the surface state pool center so future executions of the command
- * buffer can adjust correctly.
- */
- cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;
-
- return VK_SUCCESS;
+ return result;
}
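For readers following the companion-sync hand-off above, here is a minimal standalone model of the control flow. It is not driver code and the model_* names are hypothetical: when any command buffer brings a companion RCS batch, the first submission withholds the caller's signals and a second submission waits on the companion fence before signalling them.

#include <stdbool.h>
#include <stdio.h>

static void model_exec(int wait_count, int cmd_buffer_count, int signal_count)
{
   printf("exec: waits=%d cmd_buffers=%d signals=%d\n",
          wait_count, cmd_buffer_count, signal_count);
}

static void model_queue_exec_locked(bool needs_companion_sync, int wait_count,
                                    int cmd_buffer_count, int signal_count)
{
   /* Main submission: hold the signals back if a companion batch exists. */
   model_exec(wait_count, cmd_buffer_count,
              needs_companion_sync ? 0 : signal_count);

   /* Follow-up submission: wait on the companion fence, then signal. */
   if (needs_companion_sync)
      model_exec(1 /* companion_sync */, 0, signal_count);
}

int main(void)
{
   model_queue_exec_locked(true, 2, 3, 1);  /* prints two exec lines */
   model_queue_exec_locked(false, 2, 3, 1); /* prints one exec line */
   return 0;
}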
-static void
-chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
- uint32_t num_cmd_buffers)
+static inline bool
+can_chain_query_pools(struct anv_query_pool *p1, struct anv_query_pool *p2)
{
- if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
- assert(num_cmd_buffers == 1);
- return;
- }
-
- /* Chain the N-1 first batch buffers */
- for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++)
- anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
-
- /* Put an end to the last one */
- anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
+ return (!p1 || !p2 || p1 == p2);
}
static VkResult
-setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
- struct anv_queue *queue,
- struct anv_cmd_buffer **cmd_buffers,
- uint32_t num_cmd_buffers)
+anv_queue_submit_sparse_bind_locked(struct anv_queue *queue,
+ struct vk_queue_submit *submit)
{
struct anv_device *device = queue->device;
- struct anv_state_pool *ss_pool = &device->surface_state_pool;
VkResult result;
- /* Edit the tail of the command buffers to chain them all together if they
- * can be.
+ /* When fake sparse is enabled, we do accept creating "sparse" resources
+ * but can't really handle sparse submission. Fake sparse is
+ * supposed to be used by applications that request sparse to be enabled
+ * but don't actually *use* it.
*/
- chain_command_buffers(cmd_buffers, num_cmd_buffers);
-
- for (uint32_t i = 0; i < num_cmd_buffers; i++) {
- result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);
- if (result != VK_SUCCESS)
- return result;
+ if (device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) {
+ if (INTEL_DEBUG(DEBUG_SPARSE))
+ fprintf(stderr, "=== application submitting sparse operations: "
+ "buffer_bind:%d image_opaque_bind:%d image_bind:%d\n",
+ submit->buffer_bind_count, submit->image_opaque_bind_count,
+ submit->image_bind_count);
+ return vk_queue_set_lost(&queue->vk, "Sparse binding not supported");
}
- /* Add all the global BOs to the object list for softpin case. */
- if (device->physical->use_softpin) {
- anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ assert(submit->command_buffer_count == 0);
- struct anv_block_pool *pool;
- pool = &device->dynamic_state_pool.block_pool;
- anv_block_pool_foreach_bo(bo, pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ if (INTEL_DEBUG(DEBUG_SPARSE)) {
+ fprintf(stderr, "[sparse submission, buffers:%u opaque_images:%u "
+ "images:%u waits:%u signals:%u]\n",
+ submit->buffer_bind_count,
+ submit->image_opaque_bind_count,
+ submit->image_bind_count,
+ submit->wait_count, submit->signal_count);
+ }
- pool = &device->general_state_pool.block_pool;
- anv_block_pool_foreach_bo(bo, pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ struct anv_sparse_submission sparse_submit = {
+ .queue = queue,
+ .binds = NULL,
+ .binds_len = 0,
+ .binds_capacity = 0,
+ .wait_count = submit->wait_count,
+ .signal_count = submit->signal_count,
+ .waits = submit->waits,
+ .signals = submit->signals,
+ };
- pool = &device->instruction_state_pool.block_pool;
- anv_block_pool_foreach_bo(bo, pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ for (uint32_t i = 0; i < submit->buffer_bind_count; i++) {
+ VkSparseBufferMemoryBindInfo *bind_info = &submit->buffer_binds[i];
+ ANV_FROM_HANDLE(anv_buffer, buffer, bind_info->buffer);
- pool = &device->binding_table_pool.block_pool;
- anv_block_pool_foreach_bo(bo, pool) {
- result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
- if (result != VK_SUCCESS)
- return result;
- }
+ assert(anv_buffer_is_sparse(buffer));
- /* Add the BOs for all user allocated memory objects because we can't
- * track after binding updates of VK_EXT_descriptor_indexing.
- */
- list_for_each_entry(struct anv_device_memory, mem,
- &device->memory_objects, link) {
- result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);
+ for (uint32_t j = 0; j < bind_info->bindCount; j++) {
+ result = anv_sparse_bind_buffer(device, buffer,
+ &bind_info->pBinds[j],
+ &sparse_submit);
if (result != VK_SUCCESS)
- return result;
+ goto out_free_submit;
}
- } else {
- /* We do not support chaining primary command buffers without
- * softpin.
- */
- assert(num_cmd_buffers == 1);
}
- bool no_reloc = true;
- if (execbuf->has_relocs) {
- no_reloc = execbuf_can_skip_relocations(execbuf);
- if (no_reloc) {
- /* If we were able to successfully relocate everything, tell the
- * kernel that it can skip doing relocations. The requirement for
- * using NO_RELOC is:
- *
- * 1) The addresses written in the objects must match the
- * corresponding reloc.presumed_offset which in turn must match
- * the corresponding execobject.offset.
- *
- * 2) To avoid stalling, execobject.offset should match the current
- * address of that object within the active context.
- *
- * In order to satisfy all of the invariants that make userspace
- * relocations to be safe (see relocate_cmd_buffer()), we need to
- * further ensure that the addresses we use match those used by the
- * kernel for the most recent execbuf2.
- *
- * The kernel may still choose to do relocations anyway if something
- * has moved in the GTT. In this case, the relocation list still
- * needs to be valid. All relocations on the batch buffers are
- * already valid and kept up-to-date. For surface state relocations,
- * by applying the relocations in relocate_cmd_buffer, we ensured
- * that the address in the RENDER_SURFACE_STATE matches
- * presumed_offset, so it should be safe for the kernel to relocate
- * them as needed.
- */
- for (uint32_t i = 0; i < num_cmd_buffers; i++) {
- relocate_cmd_buffer(cmd_buffers[i], execbuf);
+ for (uint32_t i = 0; i < submit->image_bind_count; i++) {
+ VkSparseImageMemoryBindInfo *bind_info = &submit->image_binds[i];
+ ANV_FROM_HANDLE(anv_image, image, bind_info->image);
- anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,
- device->surface_state_pool.block_pool.bo,
- true /* always relocate surface states */);
- }
- } else {
- /* In the case where we fall back to doing kernel relocations, we
- * need to ensure that the relocation list is valid. All relocations
- * on the batch buffers are already valid and kept up-to-date. Since
- * surface states are shared between command buffers and we don't
- * know what order they will be submitted to the kernel, we don't
- * know what address is actually written in the surface state object
- * at any given time. The only option is to set a bogus presumed
- * offset and let the kernel relocate them.
- */
- for (uint32_t i = 0; i < num_cmd_buffers; i++)
- reset_cmd_buffer_surface_offsets(cmd_buffers[i]);
+ assert(anv_image_is_sparse(image));
+ assert(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT);
+
+ for (uint32_t j = 0; j < bind_info->bindCount; j++) {
+ result = anv_sparse_bind_image_memory(queue, image,
+ &bind_info->pBinds[j],
+ &sparse_submit);
+ if (result != VK_SUCCESS)
+ goto out_free_submit;
}
}
- struct anv_batch_bo *first_batch_bo =
- list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link);
-
- /* The kernel requires that the last entry in the validation list be the
- * batch buffer to execute. We can simply swap the element
- * corresponding to the first batch_bo in the chain with the last
- * element in the list.
- */
- if (first_batch_bo->bo->index != execbuf->bo_count - 1) {
- uint32_t idx = first_batch_bo->bo->index;
- uint32_t last_idx = execbuf->bo_count - 1;
-
- struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
- assert(execbuf->bos[idx] == first_batch_bo->bo);
+ for (uint32_t i = 0; i < submit->image_opaque_bind_count; i++) {
+ VkSparseImageOpaqueMemoryBindInfo *bind_info =
+ &submit->image_opaque_binds[i];
+ ANV_FROM_HANDLE(anv_image, image, bind_info->image);
- execbuf->objects[idx] = execbuf->objects[last_idx];
- execbuf->bos[idx] = execbuf->bos[last_idx];
- execbuf->bos[idx]->index = idx;
+ assert(anv_image_is_sparse(image));
- execbuf->objects[last_idx] = tmp_obj;
- execbuf->bos[last_idx] = first_batch_bo->bo;
- first_batch_bo->bo->index = last_idx;
+ for (uint32_t j = 0; j < bind_info->bindCount; j++) {
+ result = anv_sparse_bind_image_opaque(device, image,
+ &bind_info->pBinds[j],
+ &sparse_submit);
+ if (result != VK_SUCCESS)
+ goto out_free_submit;
+ }
}
- /* If we are pinning our BOs, we shouldn't have to relocate anything */
- if (device->physical->use_softpin)
- assert(!execbuf->has_relocs);
+ result = anv_sparse_bind(device, &sparse_submit);
- /* Now we go through and fixup all of the relocation lists to point to the
- * correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to
- * do this after we reorder the list above as some of the indices may have
- * changed.
- */
- struct anv_batch_bo **bbo;
- if (execbuf->has_relocs) {
- assert(num_cmd_buffers == 1);
- u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos)
- anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs);
+out_free_submit:
+ vk_free(&device->vk.alloc, sparse_submit.binds);
+ return result;
+}
- anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs);
- }
+static VkResult
+anv_queue_submit_cmd_buffers_locked(struct anv_queue *queue,
+ struct vk_queue_submit *submit,
+ struct anv_utrace_submit *utrace_submit)
+{
+ VkResult result;
- if (!device->info.has_llc) {
- __builtin_ia32_mfence();
- for (uint32_t i = 0; i < num_cmd_buffers; i++) {
- u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
- for (uint32_t i = 0; i < (*bbo)->length; i += CACHELINE_SIZE)
- __builtin_ia32_clflush((*bbo)->bo->map + i);
+ if (submit->command_buffer_count == 0) {
+ result = anv_queue_exec_locked(queue, submit->wait_count, submit->waits,
+ 0 /* cmd_buffer_count */,
+ NULL /* cmd_buffers */,
+ submit->signal_count, submit->signals,
+ NULL /* perf_query_pool */,
+ 0 /* perf_query_pass */,
+ utrace_submit);
+ if (result != VK_SUCCESS)
+ return result;
+ } else {
+ /* Everything's easier if we don't have to bother with container_of() */
+ STATIC_ASSERT(offsetof(struct anv_cmd_buffer, vk) == 0);
+ struct vk_command_buffer **vk_cmd_buffers = submit->command_buffers;
+ struct anv_cmd_buffer **cmd_buffers = (void *)vk_cmd_buffers;
+ uint32_t start = 0;
+ uint32_t end = submit->command_buffer_count;
+ struct anv_query_pool *perf_query_pool =
+ cmd_buffers[start]->perf_query_pool;
+ for (uint32_t n = 0; n < end; n++) {
+ bool can_chain = false;
+ uint32_t next = n + 1;
+ /* Can we chain the last buffer into the next one? */
+ if (next < end &&
+ anv_cmd_buffer_is_chainable(cmd_buffers[n]) &&
+ anv_cmd_buffer_is_chainable(cmd_buffers[next]) &&
+ can_chain_query_pools
+ (cmd_buffers[next]->perf_query_pool, perf_query_pool)) {
+ can_chain = true;
+ perf_query_pool =
+ perf_query_pool ? perf_query_pool :
+ cmd_buffers[next]->perf_query_pool;
+ }
+ if (!can_chain) {
+ /* The next buffer cannot be chained, or we have reached the
+ * last buffer; submit what has been chained so far.
+ */
+ VkResult result =
+ anv_queue_exec_locked(queue,
+ start == 0 ? submit->wait_count : 0,
+ start == 0 ? submit->waits : NULL,
+ next - start, &cmd_buffers[start],
+ next == end ? submit->signal_count : 0,
+ next == end ? submit->signals : NULL,
+ perf_query_pool,
+ submit->perf_pass_index,
+ next == end ? utrace_submit : NULL);
+ if (result != VK_SUCCESS)
+ return result;
+ if (next < end) {
+ start = next;
+ perf_query_pool = cmd_buffers[start]->perf_query_pool;
+ }
}
}
}
+ for (uint32_t i = 0; i < submit->signal_count; i++) {
+ if (!vk_sync_is_anv_bo_sync(submit->signals[i].sync))
+ continue;
- struct anv_batch *batch = &cmd_buffers[0]->batch;
- execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
- .buffers_ptr = (uintptr_t) execbuf->objects,
- .buffer_count = execbuf->bo_count,
- .batch_start_offset = 0,
- /* On platforms that cannot chain batch buffers because of the i915
- * command parser, we have to provide the batch length. Everywhere else
- * we'll chain batches so no point in passing a length.
+ struct anv_bo_sync *bo_sync =
+ container_of(submit->signals[i].sync, struct anv_bo_sync, sync);
+
+ /* Once the execbuf has returned, we need to set the fence state to
+ * SUBMITTED. We can't do this before calling execbuf because
+ * anv_GetFenceStatus does take the global device lock before checking
+ * fence->state.
+ *
+ * We set the fence state to SUBMITTED regardless of whether or not the
+ * execbuf succeeds because we need to ensure that vkWaitForFences() and
+ * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
+ * VK_SUCCESS) in a finite amount of time even if execbuf fails.
*/
- .batch_len = device->can_chain_batches ? 0 : batch->next - batch->start,
- .cliprects_ptr = 0,
- .num_cliprects = 0,
- .DR1 = 0,
- .DR4 = 0,
- .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? I915_EXEC_NO_RELOC : 0),
- .rsvd1 = device->context_id,
- .rsvd2 = 0,
- };
-
- return VK_SUCCESS;
-}
-
-static VkResult
-setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue)
-{
- struct anv_device *device = queue->device;
- VkResult result = anv_execbuf_add_bo(device, execbuf,
- device->trivial_batch_bo,
- NULL, 0);
- if (result != VK_SUCCESS)
- return result;
+ assert(bo_sync->state == ANV_BO_SYNC_STATE_RESET);
+ bo_sync->state = ANV_BO_SYNC_STATE_SUBMITTED;
+ }
- execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
- .buffers_ptr = (uintptr_t) execbuf->objects,
- .buffer_count = execbuf->bo_count,
- .batch_start_offset = 0,
- .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */
- .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC,
- .rsvd1 = device->context_id,
- .rsvd2 = 0,
- };
+ pthread_cond_broadcast(&queue->device->queue_submit);
return VK_SUCCESS;
}
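The grouping loop above is the subtle part of this function, so here is a self-contained sketch of the same partitioning logic under hypothetical model_* names: consecutive chainable command buffers with compatible perf query pools are folded into a single submission, with the submit's waits attached to the first group and its signals to the last.

#include <stdbool.h>
#include <stdio.h>

struct model_cmd_buffer {
   bool chainable;
   int  perf_query_pool; /* 0 means "none" */
};

static bool pools_compatible(int p1, int p2)
{
   return p1 == 0 || p2 == 0 || p1 == p2;
}

static void submit_group(int start, int count, bool first, bool last)
{
   printf("submit [%d..%d): waits=%s signals=%s\n",
          start, start + count, first ? "yes" : "no", last ? "yes" : "no");
}

int main(void)
{
   struct model_cmd_buffer cbs[] = {
      { true, 0 }, { true, 1 }, { false, 0 }, { true, 0 },
   };
   int end = 4, start = 0, pool = cbs[0].perf_query_pool;

   for (int n = 0; n < end; n++) {
      int next = n + 1;
      bool can_chain = next < end && cbs[n].chainable && cbs[next].chainable &&
                       pools_compatible(cbs[next].perf_query_pool, pool);
      if (can_chain) {
         pool = pool ? pool : cbs[next].perf_query_pool;
         continue;
      }
      submit_group(start, next - start, start == 0, next == end);
      if (next < end) {
         start = next;
         pool = cbs[start].perf_query_pool;
      }
   }
   return 0;
}

With this example input the model prints three groups: [0..2) carrying the waits, [2..3), and [3..4) carrying the signals.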
-/* We lock around execbuf for three main reasons:
- *
- * 1) When a block pool is resized, we create a new gem handle with a
- * different size and, in the case of surface states, possibly a different
- * center offset but we re-use the same anv_bo struct when we do so. If
- * this happens in the middle of setting up an execbuf, we could end up
- * with our list of BOs out of sync with our list of gem handles.
- *
- * 2) The algorithm we use for building the list of unique buffers isn't
- * thread-safe. While the client is supposed to syncronize around
- * QueueSubmit, this would be extremely difficult to debug if it ever came
- * up in the wild due to a broken app. It's better to play it safe and
- * just lock around QueueSubmit.
- *
- * 3) The anv_cmd_buffer_execbuf function may perform relocations in
- * userspace. Due to the fact that the surface state buffer is shared
- * between batches, we can't afford to have that happen from multiple
- * threads at the same time. Even though the user is supposed to ensure
- * this doesn't happen, we play it safe as in (2) above.
- *
- * Since the only other things that ever take the device lock such as block
- * pool resize only rarely happen, this will almost never be contended so
- * taking a lock isn't really an expensive operation in this case.
- */
VkResult
-anv_queue_execbuf_locked(struct anv_queue *queue,
- struct anv_queue_submit *submit)
+anv_queue_submit(struct vk_queue *vk_queue,
+ struct vk_queue_submit *submit)
{
+ struct anv_queue *queue = container_of(vk_queue, struct anv_queue, vk);
struct anv_device *device = queue->device;
- struct anv_execbuf execbuf;
- anv_execbuf_init(&execbuf);
- execbuf.alloc = submit->alloc;
- execbuf.alloc_scope = submit->alloc_scope;
- execbuf.perf_query_pass = submit->perf_query_pass;
-
- /* Always add the workaround BO as it includes a driver identifier for the
- * error_state.
+ VkResult result;
+
+ if (queue->device->info->no_hw) {
+ for (uint32_t i = 0; i < submit->signal_count; i++) {
+ result = vk_sync_signal(&device->vk,
+ submit->signals[i].sync,
+ submit->signals[i].signal_value);
+ if (result != VK_SUCCESS)
+ return vk_queue_set_lost(&queue->vk, "vk_sync_signal failed");
+ }
+ return VK_SUCCESS;
+ }
+
+ /* Flush the trace points first before taking the lock as the flushing
+ * might try to take that same lock.
*/
- VkResult result =
- anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
+ struct anv_utrace_submit *utrace_submit = NULL;
+ result = anv_device_utrace_flush_cmd_buffers(
+ queue,
+ submit->command_buffer_count,
+ (struct anv_cmd_buffer **)submit->command_buffers,
+ &utrace_submit);
if (result != VK_SUCCESS)
- goto error;
+ return result;
- for (uint32_t i = 0; i < submit->fence_bo_count; i++) {
- int signaled;
- struct anv_bo *bo = anv_unpack_ptr(submit->fence_bos[i], 1, &signaled);
+ pthread_mutex_lock(&device->mutex);
- result = anv_execbuf_add_bo(device, &execbuf, bo, NULL,
- signaled ? EXEC_OBJECT_WRITE : 0);
- if (result != VK_SUCCESS)
- goto error;
- }
+ uint64_t start_ts = intel_ds_begin_submit(&queue->ds);
- if (submit->cmd_buffer_count) {
- result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
- submit->cmd_buffers,
- submit->cmd_buffer_count);
- } else if (submit->simple_bo) {
- result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0);
- if (result != VK_SUCCESS)
- goto error;
-
- execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
- .buffers_ptr = (uintptr_t) execbuf.objects,
- .buffer_count = execbuf.bo_count,
- .batch_start_offset = 0,
- .batch_len = submit->simple_bo_size,
- .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | I915_EXEC_NO_RELOC,
- .rsvd1 = device->context_id,
- .rsvd2 = 0,
- };
+ if (submit->buffer_bind_count ||
+ submit->image_opaque_bind_count ||
+ submit->image_bind_count) {
+ result = anv_queue_submit_sparse_bind_locked(queue, submit);
} else {
- result = setup_empty_execbuf(&execbuf, queue);
+ result = anv_queue_submit_cmd_buffers_locked(queue, submit,
+ utrace_submit);
}
- if (result != VK_SUCCESS)
- goto error;
+ /* Take submission ID under lock */
+ intel_ds_end_submit(&queue->ds, start_ts);
- const bool has_perf_query =
- submit->perf_query_pass >= 0 &&
- submit->cmd_buffer_count &&
- submit->perf_query_pool;
+ pthread_mutex_unlock(&device->mutex);
- if (INTEL_DEBUG & DEBUG_SUBMIT) {
- fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0\n",
- execbuf.execbuf.batch_start_offset, execbuf.execbuf.batch_len);
- for (uint32_t i = 0; i < execbuf.bo_count; i++) {
- const struct anv_bo *bo = execbuf.bos[i];
+ intel_ds_device_process(&device->ds, true);
- fprintf(stderr, " BO: addr=0x%016"PRIx64" size=%010"PRIx64" handle=%05u name=%s\n",
- bo->offset, bo->size, bo->gem_handle, bo->name);
- }
- }
+ return result;
+}
- if (INTEL_DEBUG & DEBUG_BATCH) {
- fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues));
- if (submit->cmd_buffer_count) {
- if (has_perf_query) {
- struct anv_query_pool *query_pool = submit->perf_query_pool;
- struct anv_bo *pass_batch_bo = query_pool->bo;
- uint64_t pass_batch_offset =
- khr_perf_query_preamble_offset(query_pool,
- submit->perf_query_pass);
-
- intel_print_batch(&device->decoder_ctx,
- pass_batch_bo->map + pass_batch_offset, 64,
- pass_batch_bo->offset + pass_batch_offset, false);
- }
+VkResult
+anv_queue_submit_simple_batch(struct anv_queue *queue,
+ struct anv_batch *batch,
+ bool is_companion_rcs_batch)
+{
+ struct anv_device *device = queue->device;
+ VkResult result = VK_SUCCESS;
- for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
- struct anv_batch_bo **bo =
- u_vector_tail(&submit->cmd_buffers[i]->seen_bbos);
- device->cmd_buffer_being_decoded = submit->cmd_buffers[i];
- intel_print_batch(&device->decoder_ctx, (*bo)->bo->map,
- (*bo)->bo->size, (*bo)->bo->offset, false);
- device->cmd_buffer_being_decoded = NULL;
- }
- } else if (submit->simple_bo) {
- intel_print_batch(&device->decoder_ctx, submit->simple_bo->map,
- submit->simple_bo->size, submit->simple_bo->offset, false);
- } else {
- intel_print_batch(&device->decoder_ctx,
- device->trivial_batch_bo->map,
- device->trivial_batch_bo->size,
- device->trivial_batch_bo->offset, false);
- }
- }
+ if (anv_batch_has_error(batch))
+ return batch->status;
- if (submit->fence_count > 0) {
- if (device->has_thread_submit) {
- execbuf.timeline_fences.fence_count = submit->fence_count;
- execbuf.timeline_fences.handles_ptr = (uintptr_t)submit->fences;
- execbuf.timeline_fences.values_ptr = (uintptr_t)submit->fence_values;
- anv_execbuf_add_ext(&execbuf,
- DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES,
- &execbuf.timeline_fences.base);
- } else {
- execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY;
- execbuf.execbuf.num_cliprects = submit->fence_count;
- execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences;
- }
- }
+ if (queue->device->info->no_hw)
+ return VK_SUCCESS;
- if (submit->in_fence != -1) {
- assert(!device->has_thread_submit);
- execbuf.execbuf.flags |= I915_EXEC_FENCE_IN;
- execbuf.execbuf.rsvd2 |= (uint32_t)submit->in_fence;
- }
+ /* This is only used by device init so we can assume the queue is empty and
+ * we aren't fighting with a submit thread.
+ */
+ assert(vk_queue_is_empty(&queue->vk));
+
+ uint32_t batch_size = align(batch->next - batch->start, 8);
+
+ struct anv_bo *batch_bo = NULL;
+ result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size, &batch_bo);
+ if (result != VK_SUCCESS)
+ return result;
- if (submit->need_out_fence) {
- assert(!device->has_thread_submit);
- execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT;
+ memcpy(batch_bo->map, batch->start, batch_size);
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(batch_bo->alloc_flags))
+ intel_flush_range(batch_bo->map, batch_size);
+#endif
+
+ if (INTEL_DEBUG(DEBUG_BATCH) &&
+ intel_debug_batch_in_range(device->debug_frame_desc->frame_id)) {
+ int render_queue_idx =
+ anv_get_first_render_queue_index(device->physical);
+ struct intel_batch_decode_ctx *ctx = is_companion_rcs_batch ?
+ &device->decoder[render_queue_idx] :
+ queue->decoder;
+ intel_print_batch(ctx, batch_bo->map, batch_bo->size, batch_bo->offset,
+ false);
}
- if (has_perf_query) {
- struct anv_query_pool *query_pool = submit->perf_query_pool;
- assert(submit->perf_query_pass < query_pool->n_passes);
- struct intel_perf_query_info *query_info =
- query_pool->pass_query[submit->perf_query_pass];
+ result = device->kmd_backend->execute_simple_batch(queue, batch_bo,
+ batch_size,
+ is_companion_rcs_batch);
- /* Some performance queries just the pipeline statistic HW, no need for
- * OA in that case, so no need to reconfigure.
- */
- if ((INTEL_DEBUG & DEBUG_NO_OACONFIG) == 0 &&
- (query_info->kind == INTEL_PERF_QUERY_TYPE_OA ||
- query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) {
- int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
- (void *)(uintptr_t) query_info->oa_metrics_set_id);
- if (ret < 0) {
- result = anv_device_set_lost(device,
- "i915-perf config failed: %s",
- strerror(errno));
- }
- }
+ anv_bo_pool_free(&device->batch_bo_pool, batch_bo);
- struct anv_bo *pass_batch_bo = query_pool->bo;
+ return result;
+}
- struct drm_i915_gem_exec_object2 query_pass_object = {
- .handle = pass_batch_bo->gem_handle,
- .offset = pass_batch_bo->offset,
- .flags = pass_batch_bo->flags,
- };
- struct drm_i915_gem_execbuffer2 query_pass_execbuf = {
- .buffers_ptr = (uintptr_t) &query_pass_object,
- .buffer_count = 1,
- .batch_start_offset = khr_perf_query_preamble_offset(query_pool,
- submit->perf_query_pass),
- .flags = I915_EXEC_HANDLE_LUT | queue->exec_flags,
- .rsvd1 = device->context_id,
- };
+VkResult
+anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_batch *batch)
+{
+ struct anv_queue *queue = submit->queue;
+ struct anv_device *device = queue->device;
+ VkResult result = VK_SUCCESS;
- int ret = queue->device->info.no_hw ? 0 :
- anv_gem_execbuffer(queue->device, &query_pass_execbuf);
- if (ret)
- result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
- }
+ uint32_t batch_size = align(batch->next - batch->start, 8);
+ struct anv_trtt_batch_bo *trtt_bbo;
+ result = anv_trtt_batch_bo_new(device, batch_size, &trtt_bbo);
+ if (result != VK_SUCCESS)
+ return result;
- int ret = queue->device->info.no_hw ? 0 :
- anv_gem_execbuffer(queue->device, &execbuf.execbuf);
- if (ret)
- result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
+ memcpy(trtt_bbo->bo->map, batch->start, trtt_bbo->size);
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(trtt_bbo->bo->alloc_flags))
+ intel_flush_range(trtt_bbo->bo->map, trtt_bbo->size);
+#endif
- struct drm_i915_gem_exec_object2 *objects = execbuf.objects;
- for (uint32_t k = 0; k < execbuf.bo_count; k++) {
- if (execbuf.bos[k]->flags & EXEC_OBJECT_PINNED)
- assert(execbuf.bos[k]->offset == objects[k].offset);
- execbuf.bos[k]->offset = objects[k].offset;
+ if (INTEL_DEBUG(DEBUG_BATCH)) {
+ intel_print_batch(queue->decoder, trtt_bbo->bo->map, trtt_bbo->bo->size,
+ trtt_bbo->bo->offset, false);
}
- if (result == VK_SUCCESS && submit->need_out_fence)
- submit->out_fence = execbuf.execbuf.rsvd2 >> 32;
+ result = device->kmd_backend->execute_trtt_batch(submit, trtt_bbo);
- error:
- pthread_cond_broadcast(&device->queue_submit);
+ return result;
+}
- anv_execbuf_finish(&execbuf);
+void
+anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers)
+{
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ struct anv_batch_bo **bbo;
- return result;
+ __builtin_ia32_mfence();
+
+ for (uint32_t i = 0; i < num_cmd_buffers; i++) {
+ u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
+ intel_flush_range_no_fence((*bbo)->bo->map, (*bbo)->length);
+ }
+ }
+
+ __builtin_ia32_mfence();
+#endif
}
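For reference, here is a standalone sketch of the cacheline flush pattern used above on integrated GPUs without a coherent LLC, assuming x86 and a 64-byte cacheline; the intel_flush_range_no_fence() call in the driver is expected to do an equivalent walk over each batch BO map.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

#define MODEL_CACHELINE_SIZE 64

/* Flush every cacheline covering [start, start + size), with fences around
 * the walk as in anv_cmd_buffer_clflush() above. */
static void model_flush_range(void *start, size_t size)
{
   char *p = (char *)((uintptr_t)start & ~(uintptr_t)(MODEL_CACHELINE_SIZE - 1));
   char *end = (char *)start + size;

   __builtin_ia32_mfence();
   for (; p < end; p += MODEL_CACHELINE_SIZE)
      __builtin_ia32_clflush(p);
   __builtin_ia32_mfence();
}

int main(void)
{
   static char batch[4096];
   model_flush_range(batch, sizeof(batch));
   printf("flushed %zu bytes\n", sizeof(batch));
   return 0;
}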
diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c
index 765df4c5cf2..1fec49fdedd 100644
--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c
@@ -22,6 +22,7 @@
*/
#include "anv_private.h"
+#include "genxml/gen8_pack.h"
static bool
lookup_blorp_shader(struct blorp_batch *batch,
@@ -31,11 +32,9 @@ lookup_blorp_shader(struct blorp_batch *batch,
struct blorp_context *blorp = batch->blorp;
struct anv_device *device = blorp->driver_ctx;
- /* The default cache must be a real cache */
- assert(device->default_pipeline_cache.cache);
-
struct anv_shader_bin *bin =
- anv_pipeline_cache_search(&device->default_pipeline_cache, key, key_size);
+ anv_device_search_for_kernel(device, device->internal_cache,
+ key, key_size, NULL);
if (!bin)
return false;
@@ -54,26 +53,29 @@ static bool
upload_blorp_shader(struct blorp_batch *batch, uint32_t stage,
const void *key, uint32_t key_size,
const void *kernel, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
+ const void *prog_data,
uint32_t prog_data_size,
uint32_t *kernel_out, void *prog_data_out)
{
struct blorp_context *blorp = batch->blorp;
struct anv_device *device = blorp->driver_ctx;
- /* The blorp cache must be a real cache */
- assert(device->default_pipeline_cache.cache);
-
- struct anv_pipeline_bind_map bind_map = {
- .surface_count = 0,
- .sampler_count = 0,
+ struct anv_pipeline_bind_map empty_bind_map = {};
+ struct anv_push_descriptor_info empty_push_desc_info = {};
+ struct anv_shader_upload_params upload_params = {
+ .stage = stage,
+ .key_data = key,
+ .key_size = key_size,
+ .kernel_data = kernel,
+ .kernel_size = kernel_size,
+ .prog_data = prog_data,
+ .prog_data_size = prog_data_size,
+ .bind_map = &empty_bind_map,
+ .push_desc_info = &empty_push_desc_info,
};
struct anv_shader_bin *bin =
- anv_pipeline_cache_upload_kernel(&device->default_pipeline_cache, stage,
- key, key_size, kernel, kernel_size,
- prog_data, prog_data_size,
- NULL, 0, NULL, &bind_map);
+ anv_device_upload_kernel(device, device->internal_cache, &upload_params);
if (!bin)
return false;
@@ -89,84 +91,142 @@ upload_blorp_shader(struct blorp_batch *batch, uint32_t stage,
return true;
}
+static void
+upload_dynamic_state(struct blorp_context *context,
+ const void *data, uint32_t size,
+ uint32_t alignment, enum blorp_dynamic_state name)
+{
+ struct anv_device *device = context->driver_ctx;
+
+ device->blorp.dynamic_states[name].state =
+ anv_state_pool_emit_data(&device->dynamic_state_pool,
+ size, alignment, data);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ device->blorp.dynamic_states[name].db_state =
+ anv_state_pool_emit_data(&device->dynamic_state_db_pool,
+ size, alignment, data);
+ }
+}
+
void
anv_device_init_blorp(struct anv_device *device)
{
- blorp_init(&device->blorp, device, &device->isl_dev);
- device->blorp.compiler = device->physical->compiler;
- device->blorp.lookup_shader = lookup_blorp_shader;
- device->blorp.upload_shader = upload_blorp_shader;
- switch (device->info.verx10) {
- case 70:
- device->blorp.exec = gfx7_blorp_exec;
- break;
- case 75:
- device->blorp.exec = gfx75_blorp_exec;
- break;
- case 80:
- device->blorp.exec = gfx8_blorp_exec;
- break;
- case 90:
- device->blorp.exec = gfx9_blorp_exec;
- break;
- case 110:
- device->blorp.exec = gfx11_blorp_exec;
- break;
- case 120:
- device->blorp.exec = gfx12_blorp_exec;
- break;
- case 125:
- device->blorp.exec = gfx125_blorp_exec;
- break;
- default:
- unreachable("Unknown hardware generation");
- }
+ const struct blorp_config config = {
+ .use_mesh_shading = device->vk.enabled_extensions.EXT_mesh_shader,
+ .use_unrestricted_depth_range =
+ device->vk.enabled_extensions.EXT_depth_range_unrestricted,
+ .use_cached_dynamic_states = true,
+ };
+
+ blorp_init_brw(&device->blorp.context, device, &device->isl_dev,
+ device->physical->compiler, &config);
+ device->blorp.context.lookup_shader = lookup_blorp_shader;
+ device->blorp.context.upload_shader = upload_blorp_shader;
+ device->blorp.context.enable_tbimr = device->physical->instance->enable_tbimr;
+ device->blorp.context.exec = anv_genX(device->info, blorp_exec);
+ device->blorp.context.upload_dynamic_state = upload_dynamic_state;
+
+ anv_genX(device->info, blorp_init_dynamic_states)(&device->blorp.context);
}
void
anv_device_finish_blorp(struct anv_device *device)
{
- blorp_finish(&device->blorp);
+#ifdef HAVE_VALGRIND
+ /* We only need to free these to prevent valgrind errors. The backing
+ * BO will go away in a couple of lines so we don't actually leak.
+ */
+ for (uint32_t i = 0; i < ARRAY_SIZE(device->blorp.dynamic_states); i++) {
+ anv_state_pool_free(&device->dynamic_state_pool,
+ device->blorp.dynamic_states[i].state);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ anv_state_pool_free(&device->dynamic_state_db_pool,
+ device->blorp.dynamic_states[i].db_state);
+ }
+
+ }
+#endif
+ blorp_finish(&device->blorp.context);
+}
+
+static void
+anv_blorp_batch_init(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch, enum blorp_batch_flags flags)
+{
+ VkQueueFlags queue_flags = cmd_buffer->queue_family->queueFlags;
+
+ if (queue_flags & VK_QUEUE_GRAPHICS_BIT) {
+ /* blorp runs on render engine by default */
+ } else if (queue_flags & VK_QUEUE_COMPUTE_BIT) {
+ flags |= BLORP_BATCH_USE_COMPUTE;
+ } else if (queue_flags & VK_QUEUE_TRANSFER_BIT) {
+ flags |= BLORP_BATCH_USE_BLITTER;
+ } else {
+ unreachable("unknown queue family");
+ }
+
+ /* Can't have both flags at the same time. */
+ assert((flags & BLORP_BATCH_USE_BLITTER) == 0 ||
+ (flags & BLORP_BATCH_USE_COMPUTE) == 0);
+
+ blorp_batch_init(&cmd_buffer->device->blorp.context, batch, cmd_buffer, flags);
}
static void
-get_blorp_surf_for_anv_buffer(struct anv_device *device,
- struct anv_buffer *buffer, uint64_t offset,
- uint32_t width, uint32_t height,
- uint32_t row_pitch, enum isl_format format,
- bool is_dest,
- struct blorp_surf *blorp_surf,
- struct isl_surf *isl_surf)
+anv_blorp_batch_finish(struct blorp_batch *batch)
{
- const struct isl_format_layout *fmtl =
- isl_format_get_layout(format);
- bool ok UNUSED;
+ blorp_batch_finish(batch);
+}
- /* ASTC is the only format which doesn't support linear layouts.
- * Create an equivalently sized surface with ISL to get around this.
- */
- if (fmtl->txc == ISL_TXC_ASTC) {
- /* Use an equivalently sized format */
- format = ISL_FORMAT_R32G32B32A32_UINT;
- assert(fmtl->bpb == isl_format_get_layout(format)->bpb);
+static isl_surf_usage_flags_t
+get_usage_flag_for_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer,
+ bool is_dest)
+{
+ isl_surf_usage_flags_t usage;
- /* Shrink the dimensions for the new format */
- width = DIV_ROUND_UP(width, fmtl->bw);
- height = DIV_ROUND_UP(height, fmtl->bh);
+ switch (cmd_buffer->queue_family->engine_class) {
+ case INTEL_ENGINE_CLASS_RENDER:
+ usage = is_dest ? ISL_SURF_USAGE_RENDER_TARGET_BIT :
+ ISL_SURF_USAGE_TEXTURE_BIT;
+ break;
+ case INTEL_ENGINE_CLASS_COMPUTE:
+ usage = is_dest ? ISL_SURF_USAGE_STORAGE_BIT :
+ ISL_SURF_USAGE_TEXTURE_BIT;
+ break;
+ case INTEL_ENGINE_CLASS_COPY:
+ usage = is_dest ? ISL_SURF_USAGE_BLITTER_DST_BIT :
+ ISL_SURF_USAGE_BLITTER_SRC_BIT;
+ break;
+ default:
+ unreachable("Unhandled engine class");
}
+ return usage;
+}
+
+static void
+get_blorp_surf_for_anv_address(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address address,
+ uint32_t width, uint32_t height,
+ uint32_t row_pitch, enum isl_format format,
+ bool is_dest,
+ struct blorp_surf *blorp_surf,
+ struct isl_surf *isl_surf)
+{
+ bool ok UNUSED;
+ isl_surf_usage_flags_t usage =
+ get_usage_flag_for_cmd_buffer(cmd_buffer, is_dest);
+
*blorp_surf = (struct blorp_surf) {
.surf = isl_surf,
.addr = {
- .buffer = buffer->address.bo,
- .offset = buffer->address.offset + offset,
- .mocs = anv_mocs(device, buffer->address.bo,
- is_dest ? ISL_SURF_USAGE_RENDER_TARGET_BIT
- : ISL_SURF_USAGE_TEXTURE_BIT),
+ .buffer = address.bo,
+ .offset = address.offset,
+ .mocs = anv_mocs(cmd_buffer->device, address.bo, usage),
},
};
- ok = isl_surf_init(&device->isl_dev, isl_surf,
+ ok = isl_surf_init(&cmd_buffer->device->isl_dev, isl_surf,
.dim = ISL_SURF_DIM_2D,
.format = format,
.width = width,
@@ -176,12 +236,26 @@ get_blorp_surf_for_anv_buffer(struct anv_device *device,
.array_len = 1,
.samples = 1,
.row_pitch_B = row_pitch,
- .usage = is_dest ? ISL_SURF_USAGE_RENDER_TARGET_BIT
- : ISL_SURF_USAGE_TEXTURE_BIT,
+ .usage = usage,
.tiling_flags = ISL_TILING_LINEAR_BIT);
assert(ok);
}
+static void
+get_blorp_surf_for_anv_buffer(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_buffer *buffer, uint64_t offset,
+ uint32_t width, uint32_t height,
+ uint32_t row_pitch, enum isl_format format,
+ bool is_dest,
+ struct blorp_surf *blorp_surf,
+ struct isl_surf *isl_surf)
+{
+ get_blorp_surf_for_anv_address(cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ width, height, row_pitch, format,
+ is_dest, blorp_surf, isl_surf);
+}
+
/* Pick something high enough that it won't be used in core and low enough it
* will never map to an extension.
*/
@@ -197,7 +271,7 @@ anv_to_blorp_address(struct anv_address addr)
}
static void
-get_blorp_surf_for_anv_image(const struct anv_device *device,
+get_blorp_surf_for_anv_image(const struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
VkImageAspectFlags aspect,
VkImageUsageFlags usage,
@@ -205,18 +279,19 @@ get_blorp_surf_for_anv_image(const struct anv_device *device,
enum isl_aux_usage aux_usage,
struct blorp_surf *blorp_surf)
{
+ const struct anv_device *device = cmd_buffer->device;
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
if (layout != ANV_IMAGE_LAYOUT_EXPLICIT_AUX) {
assert(usage != 0);
- aux_usage = anv_layout_to_aux_usage(&device->info, image,
- aspect, usage, layout);
+ aux_usage = anv_layout_to_aux_usage(device->info, image,
+ aspect, usage, layout,
+ cmd_buffer->queue_family->queueFlags);
}
- isl_surf_usage_flags_t mocs_usage =
- (usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT) ?
- ISL_SURF_USAGE_RENDER_TARGET_BIT : ISL_SURF_USAGE_TEXTURE_BIT;
-
+ isl_surf_usage_flags_t isl_usage =
+ get_usage_flag_for_cmd_buffer(cmd_buffer,
+ usage & VK_IMAGE_USAGE_TRANSFER_DST_BIT);
const struct anv_surface *surface = &image->planes[plane].primary_surface;
const struct anv_address address =
anv_image_address(image, &surface->memory_range);
@@ -226,7 +301,7 @@ get_blorp_surf_for_anv_image(const struct anv_device *device,
.addr = {
.buffer = address.bo,
.offset = address.offset,
- .mocs = anv_mocs(device, address.bo, mocs_usage),
+ .mocs = anv_mocs(device, address.bo, isl_usage),
},
};
@@ -242,7 +317,7 @@ get_blorp_surf_for_anv_image(const struct anv_device *device,
blorp_surf->aux_addr = (struct blorp_address) {
.buffer = aux_address.bo,
.offset = aux_address.offset,
- .mocs = anv_mocs(device, aux_address.bo, 0),
+ .mocs = anv_mocs(device, aux_address.bo, isl_usage),
};
}
@@ -267,33 +342,6 @@ get_blorp_surf_for_anv_image(const struct anv_device *device,
}
}
-static bool
-get_blorp_surf_for_anv_shadow_image(const struct anv_device *device,
- const struct anv_image *image,
- VkImageAspectFlags aspect,
- struct blorp_surf *blorp_surf)
-{
-
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
- if (!anv_surface_is_valid(&image->planes[plane].shadow_surface))
- return false;
-
- const struct anv_surface *surface = &image->planes[plane].shadow_surface;
- const struct anv_address address =
- anv_image_address(image, &surface->memory_range);
-
- *blorp_surf = (struct blorp_surf) {
- .surf = &surface->isl,
- .addr = {
- .buffer = address.bo,
- .offset = address.offset,
- .mocs = anv_mocs(device, address.bo, ISL_SURF_USAGE_RENDER_TARGET_BIT),
- },
- };
-
- return true;
-}
-
static void
copy_image(struct anv_cmd_buffer *cmd_buffer,
struct blorp_batch *batch,
@@ -301,14 +349,14 @@ copy_image(struct anv_cmd_buffer *cmd_buffer,
VkImageLayout src_image_layout,
struct anv_image *dst_image,
VkImageLayout dst_image_layout,
- const VkImageCopy2KHR *region)
+ const VkImageCopy2 *region)
{
VkOffset3D srcOffset =
- anv_sanitize_image_offset(src_image->vk.image_type, region->srcOffset);
+ vk_image_sanitize_offset(&src_image->vk, region->srcOffset);
VkOffset3D dstOffset =
- anv_sanitize_image_offset(dst_image->vk.image_type, region->dstOffset);
+ vk_image_sanitize_offset(&dst_image->vk, region->dstOffset);
VkExtent3D extent =
- anv_sanitize_image_extent(src_image->vk.image_type, region->extent);
+ vk_image_sanitize_extent(&src_image->vk, region->extent);
const uint32_t dst_level = region->dstSubresource.mipLevel;
unsigned dst_base_layer, layer_count;
@@ -340,12 +388,12 @@ copy_image(struct anv_cmd_buffer *cmd_buffer,
if (util_bitcount(src_mask) > 1) {
anv_foreach_image_aspect_bit(aspect_bit, src_image, src_mask) {
struct blorp_surf src_surf, dst_surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
src_image, 1UL << aspect_bit,
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
src_image_layout, ISL_AUX_USAGE_NONE,
&src_surf);
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
dst_image, 1UL << aspect_bit,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
dst_image_layout, ISL_AUX_USAGE_NONE,
@@ -362,28 +410,17 @@ copy_image(struct anv_cmd_buffer *cmd_buffer,
dstOffset.x, dstOffset.y,
extent.width, extent.height);
}
-
- struct blorp_surf dst_shadow_surf;
- if (get_blorp_surf_for_anv_shadow_image(cmd_buffer->device,
- dst_image,
- 1UL << aspect_bit,
- &dst_shadow_surf)) {
- for (unsigned i = 0; i < layer_count; i++) {
- blorp_copy(batch, &src_surf, src_level, src_base_layer + i,
- &dst_shadow_surf, dst_level, dst_base_layer + i,
- srcOffset.x, srcOffset.y,
- dstOffset.x, dstOffset.y,
- extent.width, extent.height);
- }
- }
}
} else {
+ /* This case handles YCbCr images; the aspect masks are compatible but
+ * don't need to be the same.
+ */
struct blorp_surf src_surf, dst_surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, src_mask,
+ get_blorp_surf_for_anv_image(cmd_buffer, src_image, src_mask,
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
src_image_layout, ISL_AUX_USAGE_NONE,
&src_surf);
- get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, dst_mask,
+ get_blorp_surf_for_anv_image(cmd_buffer, dst_image, dst_mask,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
dst_image_layout, ISL_AUX_USAGE_NONE,
&dst_surf);
@@ -398,32 +435,133 @@ copy_image(struct anv_cmd_buffer *cmd_buffer,
dstOffset.x, dstOffset.y,
extent.width, extent.height);
}
+ }
+}
- struct blorp_surf dst_shadow_surf;
- if (get_blorp_surf_for_anv_shadow_image(cmd_buffer->device,
- dst_image, dst_mask,
- &dst_shadow_surf)) {
- for (unsigned i = 0; i < layer_count; i++) {
- blorp_copy(batch, &src_surf, src_level, src_base_layer + i,
- &dst_shadow_surf, dst_level, dst_base_layer + i,
- srcOffset.x, srcOffset.y,
- dstOffset.x, dstOffset.y,
- extent.width, extent.height);
- }
+static struct anv_state
+record_main_rcs_cmd_buffer_done(struct anv_cmd_buffer *cmd_buffer)
+{
+ const struct intel_device_info *info = cmd_buffer->device->info;
+
+ const VkResult result = anv_cmd_buffer_ensure_rcs_companion(cmd_buffer);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return ANV_STATE_NULL;
+ }
+
+ assert(cmd_buffer->companion_rcs_cmd_buffer != NULL);
+
+ /* Re-emit the aux table register in every command buffer. This way we
+ * ensure that we have the table even if this command buffer doesn't
+ * initialize any images.
+ */
+ if (cmd_buffer->device->info->has_aux_map) {
+ anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
+ "new cmd buffer with aux-tt");
+ }
+
+ return anv_genX(info, cmd_buffer_begin_companion_rcs_syncpoint)(cmd_buffer);
+}
+
+static void
+end_main_rcs_cmd_buffer_done(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state syncpoint)
+{
+ const struct intel_device_info *info = cmd_buffer->device->info;
+ anv_genX(info, cmd_buffer_end_companion_rcs_syncpoint)(cmd_buffer,
+ syncpoint);
+}
+
+static bool
+anv_blorp_blitter_execute_on_companion(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image *image,
+ const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo,
+ const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)
+{
+ if (!anv_cmd_buffer_is_blitter_queue(cmd_buffer))
+ return false;
+
+ assert((pCopyBufferToImageInfo && !pCopyImageToBufferInfo) ||
+ (pCopyImageToBufferInfo && !pCopyBufferToImageInfo));
+
+ bool blorp_execute_on_companion = false;
+ VkImageAspectFlags aspect_mask = VK_IMAGE_ASPECT_NONE;
+ const uint32_t region_count = pCopyBufferToImageInfo ?
+ pCopyBufferToImageInfo->regionCount :
+ pCopyImageToBufferInfo->regionCount;
+
+ for (unsigned r = 0; r < region_count &&
+ !blorp_execute_on_companion; r++) {
+ if (pCopyBufferToImageInfo) {
+ aspect_mask =
+ pCopyBufferToImageInfo->pRegions[r].imageSubresource.aspectMask;
+ } else {
+ aspect_mask =
+ pCopyImageToBufferInfo->pRegions[r].imageSubresource.aspectMask;
+ }
+
+ enum isl_format linear_format =
+ anv_get_isl_format(cmd_buffer->device->info, image->vk.format,
+ aspect_mask, VK_IMAGE_TILING_LINEAR);
+ const struct isl_format_layout *linear_fmtl =
+ isl_format_get_layout(linear_format);
+
+ switch (linear_fmtl->bpb) {
+ case 96:
+ /* We can only support linear mode for 96bpp on the blitter engine. */
+ blorp_execute_on_companion |=
+ image->vk.tiling != VK_IMAGE_TILING_LINEAR;
+ break;
+ default:
+ blorp_execute_on_companion |= linear_fmtl->bpb % 3 == 0;
+ break;
}
}
+
+ return blorp_execute_on_companion;
+}
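As a quick standalone illustration of the bpb check above (hypothetical model_* names): a 96 bpb copy avoids the companion RCS batch only when the image is linear, while any other bpb divisible by three, e.g. 24 or 48, which typically corresponds to 3-component formats, always routes to the companion path.

#include <stdbool.h>
#include <stdio.h>

static bool model_needs_companion(unsigned bpb, bool image_is_linear)
{
   if (bpb == 96)
      return !image_is_linear; /* blitter handles 96 bpb only when linear */
   return bpb % 3 == 0;        /* e.g. 24 or 48 bpb */
}

int main(void)
{
   const unsigned bpbs[] = { 8, 24, 32, 48, 96, 128 };
   for (unsigned i = 0; i < 6; i++)
      printf("bpb=%3u tiled -> companion RCS: %d\n",
             bpbs[i], model_needs_companion(bpbs[i], false));
   return 0;
}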
+
+static bool
+anv_blorp_execute_on_companion(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image *dst_image)
+{
+ /* MSAA images have to be dealt with on the companion RCS command buffer
+ * for both the CCS and BCS engines.
+ */
+ if ((anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_compute_queue(cmd_buffer)) &&
+ dst_image->vk.samples > 1)
+ return true;
+
+ /* Emulation of formats is done through a compute shader, so we need
+ * the companion command buffer for the BCS engine.
+ */
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) &&
+ dst_image->emu_plane_format != VK_FORMAT_UNDEFINED)
+ return true;
+
+ return false;
}
-void anv_CmdCopyImage2KHR(
+void anv_CmdCopyImage2(
VkCommandBuffer commandBuffer,
- const VkCopyImageInfo2KHR* pCopyImageInfo)
+ const VkCopyImageInfo2* pCopyImageInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, src_image, pCopyImageInfo->srcImage);
ANV_FROM_HANDLE(anv_image, dst_image, pCopyImageInfo->dstImage);
+ struct anv_cmd_buffer *main_cmd_buffer = cmd_buffer;
+ UNUSED struct anv_state rcs_done = ANV_STATE_NULL;
+
+ if (anv_blorp_execute_on_companion(cmd_buffer, dst_image)) {
+ rcs_done = record_main_rcs_cmd_buffer_done(cmd_buffer);
+ cmd_buffer = cmd_buffer->companion_rcs_cmd_buffer;
+ }
+
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) {
copy_image(cmd_buffer, &batch,
@@ -432,7 +570,32 @@ void anv_CmdCopyImage2KHR(
&pCopyImageInfo->pRegions[r]);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+
+ if (dst_image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+ const enum anv_pipe_bits pipe_bits =
+ anv_cmd_buffer_is_compute_queue(cmd_buffer) ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT :
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ anv_add_pending_pipe_bits(cmd_buffer, pipe_bits,
+ "Copy flush before astc emu");
+
+ for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) {
+ const VkImageCopy2 *region = &pCopyImageInfo->pRegions[r];
+ const VkOffset3D block_offset = vk_image_offset_to_elements(
+ &dst_image->vk, region->dstOffset);
+ const VkExtent3D block_extent = vk_image_extent_to_elements(
+ &src_image->vk, region->extent);
+ anv_astc_emu_process(cmd_buffer, dst_image,
+ pCopyImageInfo->dstImageLayout,
+ &region->dstSubresource,
+ block_offset, block_extent);
+ }
+ }
+
+ if (rcs_done.alloc_size)
+ end_main_rcs_cmd_buffer_done(main_cmd_buffer, rcs_done);
}
static enum isl_format
@@ -459,7 +622,7 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
struct anv_buffer *anv_buffer,
struct anv_image *anv_image,
VkImageLayout image_layout,
- const VkBufferImageCopy2KHR* region,
+ const VkBufferImageCopy2* region,
bool buffer_to_image)
{
struct {
@@ -481,18 +644,18 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
const VkImageAspectFlags aspect = region->imageSubresource.aspectMask;
- get_blorp_surf_for_anv_image(cmd_buffer->device, anv_image, aspect,
+ get_blorp_surf_for_anv_image(cmd_buffer, anv_image, aspect,
buffer_to_image ?
VK_IMAGE_USAGE_TRANSFER_DST_BIT :
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
image_layout, ISL_AUX_USAGE_NONE,
&image.surf);
image.offset =
- anv_sanitize_image_offset(anv_image->vk.image_type, region->imageOffset);
+ vk_image_sanitize_offset(&anv_image->vk, region->imageOffset);
image.level = region->imageSubresource.mipLevel;
VkExtent3D extent =
- anv_sanitize_image_extent(anv_image->vk.image_type, region->imageExtent);
+ vk_image_sanitize_extent(&anv_image->vk, region->imageExtent);
if (anv_image->vk.image_type != VK_IMAGE_TYPE_3D) {
image.offset.z = region->imageSubresource.baseArrayLayer;
extent.depth =
@@ -501,32 +664,17 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
}
const enum isl_format linear_format =
- anv_get_isl_format(&cmd_buffer->device->info, anv_image->vk.format,
+ anv_get_isl_format(cmd_buffer->device->info, anv_image->vk.format,
aspect, VK_IMAGE_TILING_LINEAR);
const struct isl_format_layout *linear_fmtl =
isl_format_get_layout(linear_format);
- const uint32_t buffer_row_length =
- region->bufferRowLength ?
- region->bufferRowLength : extent.width;
-
- const uint32_t buffer_image_height =
- region->bufferImageHeight ?
- region->bufferImageHeight : extent.height;
-
- const uint32_t buffer_row_pitch =
- DIV_ROUND_UP(buffer_row_length, linear_fmtl->bw) *
- (linear_fmtl->bpb / 8);
-
- const uint32_t buffer_layer_stride =
- DIV_ROUND_UP(buffer_image_height, linear_fmtl->bh) *
- buffer_row_pitch;
+ const struct vk_image_buffer_layout buffer_layout =
+ vk_image_buffer_copy_layout(&anv_image->vk, region);
/* Some formats have additional restrictions which may cause ISL to
- * fail to create a surface for us. Some examples include:
- *
- * 1. ASTC formats are not allowed to be LINEAR and must be tiled
- * 2. YCbCr formats have to have 2-pixel aligned strides
+ * fail to create a surface for us. For example, YCbCr formats
+ * have to have 2-pixel aligned strides.
*
* To avoid these issues, we always bind the buffer as if it's a
* "normal" format like RGBA32_UINT. Since we're using blorp_copy,
@@ -540,14 +688,12 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
isl_format_for_size(linear_fmtl->bpb / 8);
struct isl_surf buffer_isl_surf;
- get_blorp_surf_for_anv_buffer(cmd_buffer->device,
+ get_blorp_surf_for_anv_buffer(cmd_buffer,
anv_buffer, region->bufferOffset,
buffer_extent.width, buffer_extent.height,
- buffer_row_pitch, buffer_format, false,
- &buffer.surf, &buffer_isl_surf);
+ buffer_layout.row_stride_B, buffer_format,
+ false, &buffer.surf, &buffer_isl_surf);
- bool dst_has_shadow = false;
- struct blorp_surf dst_shadow_surf;
if (&image == dst) {
/* In this case, the source is the buffer and, since blorp takes its
* copy dimensions in terms of the source format, we have to use the
@@ -561,11 +707,6 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
aspect, dst->surf.aux_usage,
dst->level,
dst->offset.z, extent.depth);
-
- dst_has_shadow =
- get_blorp_surf_for_anv_shadow_image(cmd_buffer->device,
- anv_image, aspect,
- &dst_shadow_surf);
}
for (unsigned z = 0; z < extent.depth; z++) {
@@ -574,29 +715,40 @@ copy_buffer_to_image(struct anv_cmd_buffer *cmd_buffer,
src->offset.x, src->offset.y, dst->offset.x, dst->offset.y,
extent.width, extent.height);
- if (dst_has_shadow) {
- blorp_copy(batch, &src->surf, src->level, src->offset.z,
- &dst_shadow_surf, dst->level, dst->offset.z,
- src->offset.x, src->offset.y,
- dst->offset.x, dst->offset.y,
- extent.width, extent.height);
- }
-
image.offset.z++;
- buffer.surf.addr.offset += buffer_layer_stride;
+ buffer.surf.addr.offset += buffer_layout.image_stride_B;
}
}
-void anv_CmdCopyBufferToImage2KHR(
+void anv_CmdCopyBufferToImage2(
VkCommandBuffer commandBuffer,
- const VkCopyBufferToImageInfo2KHR* pCopyBufferToImageInfo)
+ const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
ANV_FROM_HANDLE(anv_image, dst_image, pCopyBufferToImageInfo->dstImage);
+ struct anv_cmd_buffer *main_cmd_buffer = cmd_buffer;
+ UNUSED struct anv_state rcs_done = ANV_STATE_NULL;
+
+ bool blorp_execute_on_companion =
+ anv_blorp_execute_on_companion(cmd_buffer, dst_image);
+
+ /* Check whether any of the aspects is incompatible with the blitter engine.
+ * If so, use the companion RCS command buffer for the blit, since the
+ * blitter has no native support for 3-component formats except 96bpp linear.
+ */
+ blorp_execute_on_companion |=
+ anv_blorp_blitter_execute_on_companion(cmd_buffer, dst_image,
+ pCopyBufferToImageInfo, NULL);
+
+ if (blorp_execute_on_companion) {
+ rcs_done = record_main_rcs_cmd_buffer_done(cmd_buffer);
+ cmd_buffer = cmd_buffer->companion_rcs_cmd_buffer;
+ }
+
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) {
copy_buffer_to_image(cmd_buffer, &batch, src_buffer, dst_image,
@@ -604,19 +756,76 @@ void anv_CmdCopyBufferToImage2KHR(
&pCopyBufferToImageInfo->pRegions[r], true);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+
+ if (dst_image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ assert(!anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+ const enum anv_pipe_bits pipe_bits =
+ anv_cmd_buffer_is_compute_queue(cmd_buffer) ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT :
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ anv_add_pending_pipe_bits(cmd_buffer, pipe_bits,
+ "Copy flush before astc emu");
+
+ for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) {
+ const VkBufferImageCopy2 *region =
+ &pCopyBufferToImageInfo->pRegions[r];
+ const VkOffset3D block_offset = vk_image_offset_to_elements(
+ &dst_image->vk, region->imageOffset);
+ const VkExtent3D block_extent = vk_image_extent_to_elements(
+ &dst_image->vk, region->imageExtent);
+ anv_astc_emu_process(cmd_buffer, dst_image,
+ pCopyBufferToImageInfo->dstImageLayout,
+ &region->imageSubresource,
+ block_offset, block_extent);
+ }
+ }
+
+ if (rcs_done.alloc_size)
+ end_main_rcs_cmd_buffer_done(main_cmd_buffer, rcs_done);
}
-void anv_CmdCopyImageToBuffer2KHR(
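+/* Track pending buffer writes on the command buffer: non-graphics queues
+ * record compute write bits, graphics queues record render-target write
+ * bits.
+ */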
+static void
+anv_add_buffer_write_pending_bits(struct anv_cmd_buffer *cmd_buffer,
+ const char *reason)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+
+ cmd_buffer->state.queries.buffer_write_bits |=
+ (cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) == 0 ?
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS :
+ ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(devinfo);
+}
+
+void anv_CmdCopyImageToBuffer2(
VkCommandBuffer commandBuffer,
- const VkCopyImageToBufferInfo2KHR* pCopyImageToBufferInfo)
+ const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, src_image, pCopyImageToBufferInfo->srcImage);
ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
+ UNUSED struct anv_cmd_buffer *main_cmd_buffer = cmd_buffer;
+ UNUSED struct anv_state rcs_done = ANV_STATE_NULL;
+
+ bool blorp_execute_on_companion =
+ anv_blorp_execute_on_companion(cmd_buffer, src_image);
+
+ /* Check whether any of the aspects is incompatible with the blitter engine.
+ * If so, use the companion RCS command buffer for the blit, since the
+ * blitter has no native support for 3-component formats except 96bpp linear.
+ */
+ blorp_execute_on_companion |=
+ anv_blorp_blitter_execute_on_companion(cmd_buffer, src_image, NULL,
+ pCopyImageToBufferInfo);
+
+ if (blorp_execute_on_companion) {
+ rcs_done = record_main_rcs_cmd_buffer_done(cmd_buffer);
+ cmd_buffer = cmd_buffer->companion_rcs_cmd_buffer;
+ }
+
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < pCopyImageToBufferInfo->regionCount; r++) {
copy_buffer_to_image(cmd_buffer, &batch, dst_buffer, src_image,
@@ -624,9 +833,12 @@ void anv_CmdCopyImageToBuffer2KHR(
&pCopyImageToBufferInfo->pRegions[r], false);
}
- blorp_batch_finish(&batch);
+ anv_add_buffer_write_pending_bits(cmd_buffer, "after copy image to buffer");
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+ anv_blorp_batch_finish(&batch);
+
+ if (rcs_done.alloc_size)
+ end_main_rcs_cmd_buffer_done(main_cmd_buffer, rcs_done);
}
static bool
@@ -657,7 +869,7 @@ blit_image(struct anv_cmd_buffer *cmd_buffer,
VkImageLayout src_image_layout,
struct anv_image *dst_image,
VkImageLayout dst_image_layout,
- const VkImageBlit2KHR *region,
+ const VkImageBlit2 *region,
VkFilter filter)
{
const VkImageSubresourceLayers *src_res = &region->srcSubresource;
@@ -681,20 +893,35 @@ blit_image(struct anv_cmd_buffer *cmd_buffer,
dst_res->aspectMask));
anv_foreach_image_aspect_bit(aspect_bit, src_image, src_res->aspectMask) {
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
src_image, 1U << aspect_bit,
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
src_image_layout, ISL_AUX_USAGE_NONE, &src);
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
dst_image, 1U << aspect_bit,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
dst_image_layout, ISL_AUX_USAGE_NONE, &dst);
+ VkFormat src_vk_format = src_image->vk.format;
+
+ if (src_image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ /* redirect src to the hidden plane */
+ const uint32_t plane = src_image->n_planes;
+ const struct anv_surface *surface =
+ &src_image->planes[plane].primary_surface;
+ const struct anv_address address =
+ anv_image_address(src_image, &surface->memory_range);
+ src.surf = &surface->isl;
+ src.addr.offset = address.offset;
+
+ src_vk_format = src_image->emu_plane_format;
+ }
+
struct anv_format_plane src_format =
- anv_get_format_aspect(&cmd_buffer->device->info, src_image->vk.format,
+ anv_get_format_aspect(cmd_buffer->device->info, src_vk_format,
1U << aspect_bit, src_image->vk.tiling);
struct anv_format_plane dst_format =
- anv_get_format_aspect(&cmd_buffer->device->info, dst_image->vk.format,
+ anv_get_format_aspect(cmd_buffer->device->info, dst_image->vk.format,
1U << aspect_bit, dst_image->vk.tiling);
unsigned dst_start, dst_end;
@@ -768,16 +995,16 @@ blit_image(struct anv_cmd_buffer *cmd_buffer,
}
}
-void anv_CmdBlitImage2KHR(
+void anv_CmdBlitImage2(
VkCommandBuffer commandBuffer,
- const VkBlitImageInfo2KHR* pBlitImageInfo)
+ const VkBlitImageInfo2* pBlitImageInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, src_image, pBlitImageInfo->srcImage);
ANV_FROM_HANDLE(anv_image, dst_image, pBlitImageInfo->dstImage);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < pBlitImageInfo->regionCount; r++) {
blit_image(cmd_buffer, &batch,
@@ -786,7 +1013,7 @@ void anv_CmdBlitImage2KHR(
&pBlitImageInfo->pRegions[r], pBlitImageInfo->filter);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
/**
@@ -815,43 +1042,46 @@ copy_buffer(struct anv_device *device,
struct blorp_batch *batch,
struct anv_buffer *src_buffer,
struct anv_buffer *dst_buffer,
- const VkBufferCopy2KHR *region)
+ const VkBufferCopy2 *region)
{
struct blorp_address src = {
.buffer = src_buffer->address.bo,
.offset = src_buffer->address.offset + region->srcOffset,
.mocs = anv_mocs(device, src_buffer->address.bo,
- ISL_SURF_USAGE_TEXTURE_BIT),
+ blorp_batch_isl_copy_usage(batch, false /* is_dest */)),
};
struct blorp_address dst = {
.buffer = dst_buffer->address.bo,
.offset = dst_buffer->address.offset + region->dstOffset,
.mocs = anv_mocs(device, dst_buffer->address.bo,
- ISL_SURF_USAGE_RENDER_TARGET_BIT),
+ blorp_batch_isl_copy_usage(batch, true /* is_dest */)),
};
blorp_buffer_copy(batch, src, dst, region->size);
}
-void anv_CmdCopyBuffer2KHR(
+void anv_CmdCopyBuffer2(
VkCommandBuffer commandBuffer,
- const VkCopyBufferInfo2KHR* pCopyBufferInfo)
+ const VkCopyBufferInfo2* pCopyBufferInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
ANV_FROM_HANDLE(anv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ cmd_buffer->state.current_pipeline ==
+ cmd_buffer->device->physical->gpgpu_pipeline_value ?
+ BLORP_BATCH_USE_COMPUTE : 0);
for (unsigned r = 0; r < pCopyBufferInfo->regionCount; r++) {
copy_buffer(cmd_buffer->device, &batch, src_buffer, dst_buffer,
&pCopyBufferInfo->pRegions[r]);
}
- blorp_batch_finish(&batch);
+ anv_add_buffer_write_pending_bits(cmd_buffer, "after copy buffer");
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+ anv_blorp_batch_finish(&batch);
}
@@ -866,7 +1096,10 @@ void anv_CmdUpdateBuffer(
ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ cmd_buffer->state.current_pipeline ==
+ cmd_buffer->device->physical->gpgpu_pipeline_value ?
+ BLORP_BATCH_USE_COMPUTE : 0);
/* We can't quite grab a full block because the state stream needs a
* little data at the top to build its linked list.
@@ -887,21 +1120,25 @@ void anv_CmdUpdateBuffer(
const uint32_t copy_size = MIN2(dataSize, max_update_size);
struct anv_state tmp_data =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, copy_size, 64);
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, copy_size, 64);
+ struct anv_address tmp_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, tmp_data);
memcpy(tmp_data.map, pData, copy_size);
struct blorp_address src = {
- .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = tmp_data.offset,
- .mocs = isl_mocs(&cmd_buffer->device->isl_dev,
- ISL_SURF_USAGE_TEXTURE_BIT, false)
+ .buffer = tmp_addr.bo,
+ .offset = tmp_addr.offset,
+ .mocs = anv_mocs(cmd_buffer->device, NULL,
+ get_usage_flag_for_cmd_buffer(cmd_buffer,
+ false /* is_dest */)),
};
struct blorp_address dst = {
.buffer = dst_buffer->address.bo,
.offset = dst_buffer->address.offset + dstOffset,
.mocs = anv_mocs(cmd_buffer->device, dst_buffer->address.bo,
- ISL_SURF_USAGE_RENDER_TARGET_BIT),
+ get_usage_flag_for_cmd_buffer(cmd_buffer,
+ true /* is_dest */)),
};
blorp_buffer_copy(&batch, src, dst, copy_size);
@@ -911,44 +1148,33 @@ void anv_CmdUpdateBuffer(
pData = (void *)pData + copy_size;
}
- blorp_batch_finish(&batch);
+ anv_add_buffer_write_pending_bits(cmd_buffer, "update buffer");
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+ anv_blorp_batch_finish(&batch);
}
-void anv_CmdFillBuffer(
- VkCommandBuffer commandBuffer,
- VkBuffer dstBuffer,
- VkDeviceSize dstOffset,
- VkDeviceSize fillSize,
- uint32_t data)
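+/* Fill an arbitrary GPU address range with a 32-bit pattern using BLORP
+ * clears: pick the largest power-of-two block size compatible with the
+ * offset and size, clear full MAX_SURFACE_DIM rectangles, then a rectangle
+ * of whole rows, and finally the tail as a single row.
+ */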
+void
+anv_cmd_buffer_fill_area(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address address,
+ VkDeviceSize size,
+ uint32_t data)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
struct blorp_surf surf;
struct isl_surf isl_surf;
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
-
- fillSize = anv_buffer_get_range(dst_buffer, dstOffset, fillSize);
-
- /* From the Vulkan spec:
- *
- * "size is the number of bytes to fill, and must be either a multiple
- * of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of
- * the buffer. If VK_WHOLE_SIZE is used and the remaining size of the
- * buffer is not a multiple of 4, then the nearest smaller multiple is
- * used."
- */
- fillSize &= ~3ull;
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ cmd_buffer->state.current_pipeline ==
+ cmd_buffer->device->physical->gpgpu_pipeline_value ?
+ BLORP_BATCH_USE_COMPUTE : 0);
/* First, we compute the biggest format that can be used with the
* given offsets and size.
*/
int bs = 16;
- bs = gcd_pow2_u64(bs, dstOffset);
- bs = gcd_pow2_u64(bs, fillSize);
+ uint64_t offset = address.offset;
+ bs = gcd_pow2_u64(bs, offset);
+ bs = gcd_pow2_u64(bs, size);
enum isl_format isl_format = isl_format_for_size(bs);
union isl_color_value color = {
@@ -956,53 +1182,89 @@ void anv_CmdFillBuffer(
};
const uint64_t max_fill_size = MAX_SURFACE_DIM * MAX_SURFACE_DIM * bs;
- while (fillSize >= max_fill_size) {
- get_blorp_surf_for_anv_buffer(cmd_buffer->device,
- dst_buffer, dstOffset,
- MAX_SURFACE_DIM, MAX_SURFACE_DIM,
- MAX_SURFACE_DIM * bs, isl_format, true,
- &surf, &isl_surf);
+ while (size >= max_fill_size) {
+ get_blorp_surf_for_anv_address(cmd_buffer,
+ (struct anv_address) {
+ .bo = address.bo, .offset = offset,
+ },
+ MAX_SURFACE_DIM, MAX_SURFACE_DIM,
+ MAX_SURFACE_DIM * bs, isl_format,
+ true /* is_dest */,
+ &surf, &isl_surf);
blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY,
0, 0, 1, 0, 0, MAX_SURFACE_DIM, MAX_SURFACE_DIM,
- color, NULL);
- fillSize -= max_fill_size;
- dstOffset += max_fill_size;
+ color, 0 /* color_write_disable */);
+ size -= max_fill_size;
+ offset += max_fill_size;
}
- uint64_t height = fillSize / (MAX_SURFACE_DIM * bs);
+ uint64_t height = size / (MAX_SURFACE_DIM * bs);
assert(height < MAX_SURFACE_DIM);
if (height != 0) {
const uint64_t rect_fill_size = height * MAX_SURFACE_DIM * bs;
- get_blorp_surf_for_anv_buffer(cmd_buffer->device,
- dst_buffer, dstOffset,
- MAX_SURFACE_DIM, height,
- MAX_SURFACE_DIM * bs, isl_format, true,
- &surf, &isl_surf);
+ get_blorp_surf_for_anv_address(cmd_buffer,
+ (struct anv_address) {
+ .bo = address.bo, .offset = offset,
+ },
+ MAX_SURFACE_DIM, height,
+ MAX_SURFACE_DIM * bs, isl_format,
+ true /* is_dest */,
+ &surf, &isl_surf);
blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY,
0, 0, 1, 0, 0, MAX_SURFACE_DIM, height,
- color, NULL);
- fillSize -= rect_fill_size;
- dstOffset += rect_fill_size;
+ color, 0 /* color_write_disable */);
+ size -= rect_fill_size;
+ offset += rect_fill_size;
}
- if (fillSize != 0) {
- const uint32_t width = fillSize / bs;
- get_blorp_surf_for_anv_buffer(cmd_buffer->device,
- dst_buffer, dstOffset,
- width, 1,
- width * bs, isl_format, true,
- &surf, &isl_surf);
+ if (size != 0) {
+ const uint32_t width = size / bs;
+ get_blorp_surf_for_anv_address(cmd_buffer,
+ (struct anv_address) {
+ .bo = address.bo, .offset = offset,
+ },
+ width, 1,
+ width * bs, isl_format,
+ true /* is_dest */,
+ &surf, &isl_surf);
blorp_clear(&batch, &surf, isl_format, ISL_SWIZZLE_IDENTITY,
0, 0, 1, 0, 0, width, 1,
- color, NULL);
+ color, 0 /* color_write_disable */);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+}
+
+void anv_CmdFillBuffer(
+ VkCommandBuffer commandBuffer,
+ VkBuffer dstBuffer,
+ VkDeviceSize dstOffset,
+ VkDeviceSize fillSize,
+ uint32_t data)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, dst_buffer, dstBuffer);
+
+ fillSize = vk_buffer_range(&dst_buffer->vk, dstOffset, fillSize);
+
+ /* From the Vulkan spec:
+ *
+ * "size is the number of bytes to fill, and must be either a multiple
+ * of 4, or VK_WHOLE_SIZE to fill the range from offset to the end of
+ * the buffer. If VK_WHOLE_SIZE is used and the remaining size of the
+ * buffer is not a multiple of 4, then the nearest smaller multiple is
+ * used."
+ */
+ fillSize &= ~3ull;
+
+ anv_cmd_buffer_fill_area(cmd_buffer,
+ anv_address_add(dst_buffer->address, dstOffset),
+ fillSize, data);
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_RENDER_TARGET_BUFFER_WRITES;
+ anv_add_buffer_write_pending_bits(cmd_buffer, "after fill buffer");
}
void anv_CmdClearColorImage(
@@ -1016,11 +1278,16 @@ void anv_CmdClearColorImage(
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, image, _image);
- static const bool color_write_disable[4] = { false, false, false, false };
+ struct anv_cmd_buffer *main_cmd_buffer = cmd_buffer;
+ UNUSED struct anv_state rcs_done = ANV_STATE_NULL;
- struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ if (anv_blorp_execute_on_companion(cmd_buffer, image)) {
+ rcs_done = record_main_rcs_cmd_buffer_done(cmd_buffer);
+ cmd_buffer = cmd_buffer->companion_rcs_cmd_buffer;
+ }
+ struct blorp_batch batch;
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
for (unsigned r = 0; r < rangeCount; r++) {
if (pRanges[r].aspectMask == 0)
@@ -1029,13 +1296,13 @@ void anv_CmdClearColorImage(
assert(pRanges[r].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, pRanges[r].aspectMask,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
imageLayout, ISL_AUX_USAGE_NONE, &surf);
struct anv_format_plane src_format =
- anv_get_format_aspect(&cmd_buffer->device->info, image->vk.format,
+ anv_get_format_aspect(cmd_buffer->device->info, image->vk.format,
VK_IMAGE_ASPECT_COLOR_BIT, image->vk.tiling);
unsigned base_layer = pRanges[r].baseArrayLayer;
@@ -1046,12 +1313,12 @@ void anv_CmdClearColorImage(
for (uint32_t i = 0; i < level_count; i++) {
const unsigned level = pRanges[r].baseMipLevel + i;
- const unsigned level_width = anv_minify(image->vk.extent.width, level);
- const unsigned level_height = anv_minify(image->vk.extent.height, level);
+ const unsigned level_width = u_minify(image->vk.extent.width, level);
+ const unsigned level_height = u_minify(image->vk.extent.height, level);
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
base_layer = 0;
- layer_count = anv_minify(image->vk.extent.depth, level);
+ layer_count = u_minify(image->vk.extent.depth, level);
}
anv_cmd_buffer_mark_image_written(cmd_buffer, image,
@@ -1063,11 +1330,14 @@ void anv_CmdClearColorImage(
src_format.isl_format, src_format.swizzle,
level, base_layer, layer_count,
0, 0, level_width, level_height,
- vk_to_isl_color(*pColor), color_write_disable);
+ vk_to_isl_color(*pColor), 0 /* color_write_disable */);
}
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+
+ if (rcs_done.alloc_size)
+ end_main_rcs_cmd_buffer_done(main_cmd_buffer, rcs_done);
}
void anv_CmdClearDepthStencilImage(
@@ -1082,11 +1352,12 @@ void anv_CmdClearDepthStencilImage(
ANV_FROM_HANDLE(anv_image, image, image_h);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
- struct blorp_surf depth, stencil, stencil_shadow;
+ struct blorp_surf depth, stencil;
if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_DEPTH_BIT,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
imageLayout, ISL_AUX_USAGE_NONE, &depth);
@@ -1094,17 +1365,11 @@ void anv_CmdClearDepthStencilImage(
memset(&depth, 0, sizeof(depth));
}
- bool has_stencil_shadow = false;
if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_STENCIL_BIT,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
imageLayout, ISL_AUX_USAGE_NONE, &stencil);
-
- has_stencil_shadow =
- get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- &stencil_shadow);
} else {
memset(&stencil, 0, sizeof(stencil));
}
@@ -1124,11 +1389,11 @@ void anv_CmdClearDepthStencilImage(
for (uint32_t i = 0; i < level_count; i++) {
const unsigned level = pRanges[r].baseMipLevel + i;
- const unsigned level_width = anv_minify(image->vk.extent.width, level);
- const unsigned level_height = anv_minify(image->vk.extent.height, level);
+ const unsigned level_width = u_minify(image->vk.extent.width, level);
+ const unsigned level_height = u_minify(image->vk.extent.height, level);
if (image->vk.image_type == VK_IMAGE_TYPE_3D)
- layer_count = anv_minify(image->vk.extent.depth, level);
+ layer_count = u_minify(image->vk.extent.depth, level);
blorp_clear_depth_stencil(&batch, &depth, &stencil,
level, base_layer, layer_count,
@@ -1136,21 +1401,10 @@ void anv_CmdClearDepthStencilImage(
clear_depth, pDepthStencil->depth,
clear_stencil ? 0xff : 0,
pDepthStencil->stencil);
-
- if (clear_stencil && has_stencil_shadow) {
- union isl_color_value stencil_color = {
- .u32 = { pDepthStencil->stencil, },
- };
- blorp_clear(&batch, &stencil_shadow,
- ISL_FORMAT_R8_UINT, ISL_SWIZZLE_IDENTITY,
- level, base_layer, layer_count,
- 0, 0, level_width, level_height,
- stencil_color, NULL);
- }
}
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
VkResult
@@ -1170,7 +1424,7 @@ anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer,
/* Re-emit state base addresses so we get the new surface state base
* address before we start emitting binding tables etc.
*/
- anv_cmd_buffer_emit_state_base_address(cmd_buffer);
+ anv_cmd_buffer_emit_bt_pool_base_address(cmd_buffer);
*bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer, num_entries,
state_offset);
@@ -1201,43 +1455,418 @@ binding_table_for_surface_state(struct anv_cmd_buffer *cmd_buffer,
return VK_SUCCESS;
}
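+/* Check whether a vkCmdClearAttachments color clear can be turned into a
+ * CCS/MCS fast clear of the bound color attachment.
+ */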
+static bool
+can_fast_clear_color_att(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_attachment *att,
+ const VkClearAttachment *attachment,
+ uint32_t rectCount, const VkClearRect *pRects)
+{
+ union isl_color_value clear_color =
+ vk_to_isl_color(attachment->clearValue.color);
+
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
+ return false;
+
+ /* We don't support fast clearing with conditional rendering at the
+ * moment. All the tracking done around fast clears (clear color updates
+ * and fast-clear type updates) happens unconditionally.
+ */
+ if (batch->flags & BLORP_BATCH_PREDICATE_ENABLE)
+ return false;
+
+ if (rectCount > 1) {
+ anv_perf_warn(VK_LOG_OBJS(&cmd_buffer->device->vk.base),
+ "Fast clears for vkCmdClearAttachments supported only for rectCount == 1");
+ return false;
+ }
+
+ /* We only support fast-clears on the first layer */
+ if (pRects[0].layerCount > 1 || pRects[0].baseArrayLayer > 0)
+ return false;
+
+ bool is_multiview = cmd_buffer->state.gfx.view_mask != 0;
+ if (is_multiview && (cmd_buffer->state.gfx.view_mask != 1))
+ return false;
+
+ return anv_can_fast_clear_color_view(cmd_buffer->device,
+ (struct anv_image_view *)att->iview,
+ att->layout,
+ clear_color,
+ pRects->layerCount,
+ pRects->rect,
+ cmd_buffer->queue_family->queueFlags);
+}
+
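+/* Run a CCS operation (fast clear, resolve or ambiguate) on a single-sampled
+ * color image, bracketed by the pipe controls required around fast clears
+ * and resolves.
+ */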
+static void
+exec_ccs_op(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_image *image,
+ enum isl_format format, struct isl_swizzle swizzle,
+ VkImageAspectFlagBits aspect, uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op ccs_op, union isl_color_value *clear_value)
+{
+ assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
+ assert(image->vk.samples == 1);
+ assert(level < anv_image_aux_levels(image, aspect));
+ /* Multi-LOD YCbCr is not allowed */
+ assert(image->n_planes == 1 || level == 0);
+ assert(base_layer + layer_count <=
+ anv_image_aux_layers(image, aspect, level));
+
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+
+ struct blorp_surf surf;
+ get_blorp_surf_for_anv_image(cmd_buffer, image, aspect,
+ 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
+ image->planes[plane].aux_usage,
+ &surf);
+
+ uint32_t level_width = u_minify(surf.surf->logical_level0_px.w, level);
+ uint32_t level_height = u_minify(surf.surf->logical_level0_px.h, level);
+
+ /* Blorp will store the clear color for us if we provide the clear color
+ * address and we are doing a fast clear. So we save the clear value into
+ * the blorp surface.
+ */
+ if (clear_value)
+ surf.clear_color = *clear_value;
+
+ char flush_reason[64];
+ int ret =
+ snprintf(flush_reason, sizeof(flush_reason),
+ "ccs op start: %s", isl_aux_op_to_name(ccs_op));
+ assert(ret < sizeof(flush_reason));
+
+ /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
+ *
+ * "After Render target fast clear, pipe-control with color cache
+ * write-flush must be issued before sending any DRAW commands on
+ * that render target."
+ *
+ * This comment is a bit cryptic and doesn't really tell you what's going
+ * on or what's really needed. It appears that fast clear ops are not
+ * properly synchronized with other drawing. This means that we cannot
+ * have a fast clear operation in the pipe at the same time as other
+ * regular drawing operations. We need to use a PIPE_CONTROL to ensure
+ * that the contents of the previous draw hit the render target before we
+ * resolve and then use a second PIPE_CONTROL after the resolve to ensure
+ * that it is completed before any additional drawing occurs.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ (devinfo->verx10 == 120 ?
+ ANV_PIPE_DEPTH_STALL_BIT : 0) |
+ (devinfo->verx10 == 125 ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0) |
+ ANV_PIPE_PSS_STALL_SYNC_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ flush_reason);
+
+ switch (ccs_op) {
+ case ISL_AUX_OP_FAST_CLEAR:
+ /* From the ICL PRMs, Volume 9: Render Engine, State Caching :
+ *
+ * "Any values referenced by pointers within the RENDER_SURFACE_STATE
+ * or SAMPLER_STATE (e.g. Clear Color Pointer, Border Color or
+ * Indirect State Pointer) are considered to be part of that state
+ * and any changes to these referenced values requires an
+ * invalidation of the L1 state cache to ensure the new values are
+ * being used as part of the state. In the case of surface data
+ * pointed to by the Surface Base Address in RENDER SURFACE STATE,
+ * the Texture Cache must be invalidated if the surface data
+ * changes."
+ *
+ * and From the Render Target Fast Clear section,
+ *
+ * "HwManaged FastClear allows SW to store FastClearValue in separate
+ * graphics allocation, instead of keeping them in
+ * RENDER_SURFACE_STATE. This behavior can be enabled by setting
+ * ClearValueAddressEnable in RENDER_SURFACE_STATE.
+ *
+ * Proper sequence of commands is as follows:
+ *
+ * 1. Storing clear color to allocation
+ * 2. Ensuring that step 1. is finished and visible for TextureCache
+ * 3. Performing FastClear
+ *
+ * Step 2. is required on products with ClearColorConversion feature.
+ * This feature is enabled by setting ClearColorConversionEnable.
+ * This causes HW to read stored color from ClearColorAllocation and
+ * write back with the native format or RenderTarget - and clear
+ * color needs to be present and visible. Reading is done from
+ * TextureCache, writing is done to RenderCache."
+ *
+ * We're going to change the clear color. Invalidate the texture cache
+ * now to ensure the clear color conversion feature works properly.
+ * Although the docs seem to require invalidating the texture cache
+ * after updating the clear color allocation, we can do this beforehand
+ * so long as we ensure:
+ *
+ * 1. Step 1 is complete before the texture cache is accessed in step 3
+ * 2. We don't access the texture cache between invalidation and step 3
+ *
+ * The second requirement is satisfied because we'll be performing step
+ * 1 and 3 right after invalidating. The first is satisfied because
+ * BLORP updates the clear color before performing the fast clear and it
+ * performs the synchronizations suggested by the Render Target Fast
+ * Clear section (not quoted here) to ensure its completion.
+ *
+ * While we're here, also invalidate the state cache as suggested.
+ */
+ if (devinfo->ver >= 11) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,
+ "before blorp clear color update");
+ }
+
+ blorp_fast_clear(batch, &surf, format, swizzle,
+ level, base_layer, layer_count,
+ 0, 0, level_width, level_height);
+ break;
+ case ISL_AUX_OP_FULL_RESOLVE:
+ case ISL_AUX_OP_PARTIAL_RESOLVE: {
+ /* Wa_1508744258: Enable RHWO optimization for resolves */
+ const bool enable_rhwo_opt =
+ intel_needs_workaround(cmd_buffer->device->info, 1508744258);
+
+ if (enable_rhwo_opt)
+ cmd_buffer->state.pending_rhwo_optimization_enabled = true;
+
+ blorp_ccs_resolve(batch, &surf, level, base_layer, layer_count,
+ format, ccs_op);
+
+ if (enable_rhwo_opt)
+ cmd_buffer->state.pending_rhwo_optimization_enabled = false;
+ break;
+ }
+ case ISL_AUX_OP_AMBIGUATE:
+ for (uint32_t a = 0; a < layer_count; a++) {
+ const uint32_t layer = base_layer + a;
+ blorp_ccs_ambiguate(batch, &surf, level, layer);
+ }
+ break;
+ default:
+ unreachable("Unsupported CCS operation");
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ (devinfo->verx10 == 120 ?
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_STALL_BIT : 0) |
+ ANV_PIPE_PSS_STALL_SYNC_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "ccs op finish");
+}
+
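+/* Run an MCS operation (fast clear, partial resolve or ambiguate) on a
+ * multisampled color image, with the same pipe-control bracketing as the
+ * CCS operations above.
+ */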
+static void
+exec_mcs_op(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_image *image,
+ enum isl_format format, struct isl_swizzle swizzle,
+ VkImageAspectFlagBits aspect,
+ uint32_t base_layer, uint32_t layer_count,
+ enum isl_aux_op mcs_op, union isl_color_value *clear_value)
+{
+ assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+ assert(image->vk.samples > 1);
+ assert(base_layer + layer_count <= anv_image_aux_layers(image, aspect, 0));
+
+ /* Multisampling with multi-planar formats is not supported */
+ assert(image->n_planes == 1);
+
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ struct blorp_surf surf;
+ get_blorp_surf_for_anv_image(cmd_buffer, image, aspect,
+ 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
+ ISL_AUX_USAGE_MCS, &surf);
+
+ /* Blorp will store the clear color for us if we provide the clear color
+ * address and we are doing a fast clear. So we save the clear value into
+ * the blorp surface.
+ */
+ if (clear_value)
+ surf.clear_color = *clear_value;
+
+ /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
+ *
+ * "After Render target fast clear, pipe-control with color cache
+ * write-flush must be issued before sending any DRAW commands on
+ * that render target."
+ *
+ * This comment is a bit cryptic and doesn't really tell you what's going
+ * on or what's really needed. It appears that fast clear ops are not
+ * properly synchronized with other drawing. This means that we cannot
+ * have a fast clear operation in the pipe at the same time as other
+ * regular drawing operations. We need to use a PIPE_CONTROL to ensure
+ * that the contents of the previous draw hit the render target before we
+ * resolve and then use a second PIPE_CONTROL after the resolve to ensure
+ * that it is completed before any additional drawing occurs.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ (devinfo->verx10 == 120 ?
+ ANV_PIPE_DEPTH_STALL_BIT : 0) |
+ (devinfo->verx10 == 125 ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0) |
+ ANV_PIPE_PSS_STALL_SYNC_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "before fast clear mcs");
+
+ switch (mcs_op) {
+ case ISL_AUX_OP_FAST_CLEAR:
+ /* From the ICL PRMs, Volume 9: Render Engine, State Caching :
+ *
+ * "Any values referenced by pointers within the RENDER_SURFACE_STATE
+ * or SAMPLER_STATE (e.g. Clear Color Pointer, Border Color or
+ * Indirect State Pointer) are considered to be part of that state
+ * and any changes to these referenced values requires an
+ * invalidation of the L1 state cache to ensure the new values are
+ * being used as part of the state. In the case of surface data
+ * pointed to by the Surface Base Address in RENDER SURFACE STATE,
+ * the Texture Cache must be invalidated if the surface data
+ * changes."
+ *
+ * and From the Render Target Fast Clear section,
+ *
+ * "HwManaged FastClear allows SW to store FastClearValue in separate
+ * graphics allocation, instead of keeping them in
+ * RENDER_SURFACE_STATE. This behavior can be enabled by setting
+ * ClearValueAddressEnable in RENDER_SURFACE_STATE.
+ *
+ * Proper sequence of commands is as follows:
+ *
+ * 1. Storing clear color to allocation
+ * 2. Ensuring that step 1. is finished and visible for TextureCache
+ * 3. Performing FastClear
+ *
+ * Step 2. is required on products with ClearColorConversion feature.
+ * This feature is enabled by setting ClearColorConversionEnable.
+ * This causes HW to read stored color from ClearColorAllocation and
+ * write back with the native format or RenderTarget - and clear
+ * color needs to be present and visible. Reading is done from
+ * TextureCache, writing is done to RenderCache."
+ *
+ * We're going to change the clear color. Invalidate the texture cache
+ * now to ensure the clear color conversion feature works properly.
+ * Although the docs seem to require invalidating the texture cache
+ * after updating the clear color allocation, we can do this beforehand
+ * so long as we ensure:
+ *
+ * 1. Step 1 is complete before the texture cache is accessed in step 3
+ * 2. We don't access the texture cache between invalidation and step 3
+ *
+ * The second requirement is satisfied because we'll be performing step
+ * 1 and 3 right after invalidating. The first is satisfied because
+ * BLORP updates the clear color before performing the fast clear and it
+ * performs the synchronizations suggested by the Render Target Fast
+ * Clear section (not quoted here) to ensure its completion.
+ *
+ * While we're here, also invalidate the state cache as suggested.
+ */
+ if (devinfo->ver >= 11) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,
+ "before blorp clear color update");
+ }
+
+ blorp_fast_clear(batch, &surf, format, swizzle,
+ 0, base_layer, layer_count,
+ 0, 0, image->vk.extent.width, image->vk.extent.height);
+ break;
+ case ISL_AUX_OP_PARTIAL_RESOLVE:
+ blorp_mcs_partial_resolve(batch, &surf, format,
+ base_layer, layer_count);
+ break;
+ case ISL_AUX_OP_AMBIGUATE:
+ blorp_mcs_ambiguate(batch, &surf, base_layer, layer_count);
+ break;
+ case ISL_AUX_OP_FULL_RESOLVE:
+ default:
+ unreachable("Unsupported MCS operation");
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ (devinfo->verx10 == 120 ?
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_STALL_BIT : 0) |
+ ANV_PIPE_PSS_STALL_SYNC_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "after fast clear mcs");
+}
+
static void
clear_color_attachment(struct anv_cmd_buffer *cmd_buffer,
struct blorp_batch *batch,
const VkClearAttachment *attachment,
uint32_t rectCount, const VkClearRect *pRects)
{
- const struct anv_subpass *subpass = cmd_buffer->state.subpass;
- const uint32_t color_att = attachment->colorAttachment;
- assert(color_att < subpass->color_count);
- const uint32_t att_idx = subpass->color_attachments[color_att].attachment;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const uint32_t att_idx = attachment->colorAttachment;
+ assert(att_idx < gfx->color_att_count);
+ const struct anv_attachment *att = &gfx->color_att[att_idx];
- if (att_idx == VK_ATTACHMENT_UNUSED)
+ if (att->vk_format == VK_FORMAT_UNDEFINED)
return;
- struct anv_render_pass_attachment *pass_att =
- &cmd_buffer->state.pass->attachments[att_idx];
- struct anv_attachment_state *att_state =
- &cmd_buffer->state.attachments[att_idx];
+ union isl_color_value clear_color =
+ vk_to_isl_color(attachment->clearValue.color);
+
+ const struct anv_image_view *iview = att->iview;
+ if (iview &&
+ can_fast_clear_color_att(cmd_buffer, batch, att,
+ attachment, rectCount, pRects)) {
+ if (iview->image->vk.samples == 1) {
+ exec_ccs_op(cmd_buffer, batch, iview->image,
+ iview->planes[0].isl.format,
+ iview->planes[0].isl.swizzle,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
+ &clear_color);
+ } else {
+ exec_mcs_op(cmd_buffer, batch, iview->image,
+ iview->planes[0].isl.format,
+ iview->planes[0].isl.swizzle,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ 0, 1, ISL_AUX_OP_FAST_CLEAR,
+ &clear_color);
+ }
+
+ anv_cmd_buffer_mark_image_fast_cleared(cmd_buffer, iview->image,
+ iview->planes[0].isl.format,
+ clear_color);
+ anv_cmd_buffer_load_clear_color_from_image(cmd_buffer,
+ att->surface_state.state,
+ iview->image);
+ return;
+ }
uint32_t binding_table;
VkResult result =
- binding_table_for_surface_state(cmd_buffer, att_state->color.state,
+ binding_table_for_surface_state(cmd_buffer, att->surface_state.state,
&binding_table);
if (result != VK_SUCCESS)
return;
- union isl_color_value clear_color =
- vk_to_isl_color(attachment->clearValue.color);
-
/* If multiview is enabled we ignore baseArrayLayer and layerCount */
- if (subpass->view_mask) {
- u_foreach_bit(view_idx, subpass->view_mask) {
+ if (gfx->view_mask) {
+ u_foreach_bit(view_idx, gfx->view_mask) {
for (uint32_t r = 0; r < rectCount; ++r) {
const VkOffset2D offset = pRects[r].rect.offset;
const VkExtent2D extent = pRects[r].rect.extent;
blorp_clear_attachments(batch, binding_table,
- ISL_FORMAT_UNSUPPORTED, pass_att->samples,
+ ISL_FORMAT_UNSUPPORTED,
+ gfx->samples,
view_idx, 1,
offset.x, offset.y,
offset.x + extent.width,
@@ -1253,7 +1882,8 @@ clear_color_attachment(struct anv_cmd_buffer *cmd_buffer,
const VkExtent2D extent = pRects[r].rect.extent;
assert(pRects[r].layerCount != VK_REMAINING_ARRAY_LAYERS);
blorp_clear_attachments(batch, binding_table,
- ISL_FORMAT_UNSUPPORTED, pass_att->samples,
+ ISL_FORMAT_UNSUPPORTED,
+ gfx->samples,
pRects[r].baseArrayLayer,
pRects[r].layerCount,
offset.x, offset.y,
@@ -1263,28 +1893,213 @@ clear_color_attachment(struct anv_cmd_buffer *cmd_buffer,
}
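+/* Fast clear a depth/stencil image through the HiZ path, emitting the
+ * depth-cache flushes the PRMs require before and after the clear.
+ */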
static void
+anv_fast_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_image *image,
+ VkImageAspectFlags aspects,
+ uint32_t level,
+ uint32_t base_layer, uint32_t layer_count,
+ VkRect2D area, uint8_t stencil_value)
+{
+ assert(image->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT));
+
+ struct blorp_surf depth = {};
+ if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
+ const uint32_t plane =
+ anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
+ assert(base_layer + layer_count <=
+ anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level));
+ get_blorp_surf_for_anv_image(cmd_buffer,
+ image, VK_IMAGE_ASPECT_DEPTH_BIT,
+ 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
+ image->planes[plane].aux_usage, &depth);
+ }
+
+ struct blorp_surf stencil = {};
+ if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
+ const uint32_t plane =
+ anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
+ get_blorp_surf_for_anv_image(cmd_buffer,
+ image, VK_IMAGE_ASPECT_STENCIL_BIT,
+ 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
+ image->planes[plane].aux_usage, &stencil);
+ }
+
+ /* From the Sky Lake PRM Volume 7, "Depth Buffer Clear":
+ *
+ * "The following is required when performing a depth buffer clear with
+ * using the WM_STATE or 3DSTATE_WM:
+ *
+ * * If other rendering operations have preceded this clear, a
+ * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+ * enabled must be issued before the rectangle primitive used for
+ * the depth buffer clear operation.
+ * * [...]"
+ *
+ * Even though the PRM only says that this is required if using 3DSTATE_WM
+ * and a 3DPRIMITIVE, the GPU appears to also need this to avoid occasional
+ * hangs when doing a clear with WM_HZ_OP.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_STALL_BIT,
+ "before clear hiz");
+
+ if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
+ depth.aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT) {
+ /* From Bspec 47010 (Depth Buffer Clear):
+ *
+ * Since the fast clear cycles to CCS are not cached in TileCache,
+ * any previous depth buffer writes to overlapping pixels must be
+ * flushed out of TileCache before a succeeding Depth Buffer Clear.
+ * This restriction only applies to Depth Buffer with write-thru
+ * enabled, since fast clears to CCS only occur for write-thru mode.
+ *
+ * There may have been a write to this depth buffer. Flush it from the
+ * tile cache just in case.
+ *
+ * Set the CS stall bit to guarantee that the fast clear starts execution
+ * only after the tile cache flush has completed.
+ *
+ * There is no Bspec requirement to flush the data cache, but experiments
+ * show that flushing the data cache helps to resolve the corruption.
+ */
+ unsigned wa_flush = cmd_buffer->device->info->verx10 >= 125 ?
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+ wa_flush,
+ "before clear hiz_ccs_wt");
+ }
+
+ blorp_hiz_clear_depth_stencil(batch, &depth, &stencil,
+ level, base_layer, layer_count,
+ area.offset.x, area.offset.y,
+ area.offset.x + area.extent.width,
+ area.offset.y + area.extent.height,
+ aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+ ANV_HZ_FC_VAL,
+ aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
+ stencil_value);
+
+ /* From the SKL PRM, Depth Buffer Clear:
+ *
+ * "Depth Buffer Clear Workaround
+ *
+ * Depth buffer clear pass using any of the methods (WM_STATE,
+ * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL
+ * command with DEPTH_STALL bit and Depth FLUSH bits “set” before
+ * starting to render. DepthStall and DepthFlush are not needed between
+ * consecutive depth clear passes nor is it required if the depth-clear
+ * pass was done with “full_surf_clear” bit set in the
+ * 3DSTATE_WM_HZ_OP."
+ *
+ * Even though the PRM provides a bunch of conditions under which this is
+ * supposedly unnecessary, we choose to perform the flush unconditionally
+ * just to be safe.
+ *
+ * From Bspec 46959, a programming note applicable to Gfx12+:
+ *
+ * "Since HZ_OP has to be sent twice (first time set the clear/resolve state
+ * and 2nd time to clear the state), and HW internally flushes the depth
+ * cache on HZ_OP, there is no need to explicitly send a Depth Cache flush
+ * after Clear or Resolve."
+ */
+ if (cmd_buffer->device->info->verx10 < 120) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_STALL_BIT,
+ "after clear hiz");
+ }
+}
+
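+/* Check whether a vkCmdClearAttachments depth/stencil clear can take the HiZ
+ * fast clear path for the currently bound depth/stencil attachment.
+ */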
+static bool
+can_hiz_clear_att(struct anv_cmd_buffer *cmd_buffer,
+ struct blorp_batch *batch,
+ const struct anv_attachment *ds_att,
+ const VkClearAttachment *attachment,
+ uint32_t rectCount, const VkClearRect *pRects)
+{
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
+ return false;
+
+ /* From Bspec's section MI_PREDICATE:
+ *
+ * "The MI_PREDICATE command is used to control the Predicate state bit,
+ * which in turn can be used to enable/disable the processing of
+ * 3DPRIMITIVE commands."
+ *
+ * Also from BDW/CHV Bspec's 3DSTATE_WM_HZ_OP programming notes:
+ *
+ * "This command does NOT support predication from the use of the
+ * MI_PREDICATE register. To predicate depth clears and resolves on you
+ * must fall back to using the 3D_PRIMITIVE or GPGPU_WALKER commands."
+ *
+ * Since BLORP's predication is currently dependent on MI_PREDICATE, fall
+ * back to the slow depth clear path when the BLORP_BATCH_PREDICATE_ENABLE
+ * flag is set.
+ */
+ if (batch->flags & BLORP_BATCH_PREDICATE_ENABLE)
+ return false;
+
+ if (rectCount > 1) {
+ anv_perf_warn(VK_LOG_OBJS(&cmd_buffer->device->vk.base),
+ "Fast clears for vkCmdClearAttachments supported only for rectCount == 1");
+ return false;
+ }
+
+ /* When the BLORP_BATCH_NO_EMIT_DEPTH_STENCIL flag is set, BLORP can only
+ * clear the first slice of the currently configured depth/stencil view.
+ */
+ assert(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL);
+ if (pRects[0].layerCount > 1 || pRects[0].baseArrayLayer > 0)
+ return false;
+
+ return anv_can_hiz_clear_ds_view(cmd_buffer->device, ds_att->iview,
+ ds_att->layout,
+ attachment->aspectMask,
+ attachment->clearValue.depthStencil.depth,
+ pRects->rect,
+ cmd_buffer->queue_family->queueFlags);
+}
+
+static void
clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer,
struct blorp_batch *batch,
const VkClearAttachment *attachment,
uint32_t rectCount, const VkClearRect *pRects)
{
static const union isl_color_value color_value = { .u32 = { 0, } };
- const struct anv_subpass *subpass = cmd_buffer->state.subpass;
- if (!subpass->depth_stencil_attachment)
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const struct anv_attachment *d_att = &gfx->depth_att;
+ const struct anv_attachment *s_att = &gfx->stencil_att;
+ if (d_att->vk_format == VK_FORMAT_UNDEFINED &&
+ s_att->vk_format == VK_FORMAT_UNDEFINED)
return;
- const uint32_t att_idx = subpass->depth_stencil_attachment->attachment;
- assert(att_idx != VK_ATTACHMENT_UNUSED);
- struct anv_render_pass_attachment *pass_att =
- &cmd_buffer->state.pass->attachments[att_idx];
+ const struct anv_attachment *ds_att = d_att->iview ? d_att : s_att;
+ if (ds_att->iview &&
+ can_hiz_clear_att(cmd_buffer, batch, ds_att, attachment, rectCount, pRects)) {
+ anv_fast_clear_depth_stencil(cmd_buffer, batch, ds_att->iview->image,
+ attachment->aspectMask,
+ ds_att->iview->planes[0].isl.base_level,
+ ds_att->iview->planes[0].isl.base_array_layer,
+ pRects[0].layerCount, pRects->rect,
+ attachment->clearValue.depthStencil.stencil);
+ return;
+ }
bool clear_depth = attachment->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT;
bool clear_stencil = attachment->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT;
enum isl_format depth_format = ISL_FORMAT_UNSUPPORTED;
- if (clear_depth) {
- depth_format = anv_get_isl_format(&cmd_buffer->device->info,
- pass_att->format,
+ if (d_att->vk_format != VK_FORMAT_UNDEFINED) {
+ depth_format = anv_get_isl_format(cmd_buffer->device->info,
+ d_att->vk_format,
VK_IMAGE_ASPECT_DEPTH_BIT,
VK_IMAGE_TILING_OPTIMAL);
}
@@ -1292,20 +2107,21 @@ clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer,
uint32_t binding_table;
VkResult result =
binding_table_for_surface_state(cmd_buffer,
- cmd_buffer->state.null_surface_state,
+ gfx->null_surface_state,
&binding_table);
if (result != VK_SUCCESS)
return;
/* If multiview is enabled we ignore baseArrayLayer and layerCount */
- if (subpass->view_mask) {
- u_foreach_bit(view_idx, subpass->view_mask) {
+ if (gfx->view_mask) {
+ u_foreach_bit(view_idx, gfx->view_mask) {
for (uint32_t r = 0; r < rectCount; ++r) {
const VkOffset2D offset = pRects[r].rect.offset;
const VkExtent2D extent = pRects[r].rect.extent;
VkClearDepthStencilValue value = attachment->clearValue.depthStencil;
blorp_clear_attachments(batch, binding_table,
- depth_format, pass_att->samples,
+ depth_format,
+ gfx->samples,
view_idx, 1,
offset.x, offset.y,
offset.x + extent.width,
@@ -1324,7 +2140,8 @@ clear_depth_stencil_attachment(struct anv_cmd_buffer *cmd_buffer,
VkClearDepthStencilValue value = attachment->clearValue.depthStencil;
assert(pRects[r].layerCount != VK_REMAINING_ARRAY_LAYERS);
blorp_clear_attachments(batch, binding_table,
- depth_format, pass_att->samples,
+ depth_format,
+ gfx->samples,
pRects[r].baseArrayLayer,
pRects[r].layerCount,
offset.x, offset.y,
@@ -1353,7 +2170,7 @@ void anv_CmdClearAttachments(
anv_cmd_emit_conditional_render_predicate(cmd_buffer);
flags |= BLORP_BATCH_PREDICATE_ENABLE;
}
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, flags);
+ anv_blorp_batch_init(cmd_buffer, &batch, flags);
for (uint32_t a = 0; a < attachmentCount; ++a) {
if (pAttachments[a].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
@@ -1368,21 +2185,17 @@ void anv_CmdClearAttachments(
}
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
-enum subpass_stage {
- SUBPASS_STAGE_LOAD,
- SUBPASS_STAGE_DRAW,
- SUBPASS_STAGE_RESOLVE,
-};
-
-void
+static void
anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *src_image,
+ enum isl_format src_format_override,
enum isl_aux_usage src_aux_usage,
uint32_t src_level, uint32_t src_base_layer,
const struct anv_image *dst_image,
+ enum isl_format dst_format_override,
enum isl_aux_usage dst_aux_usage,
uint32_t dst_level, uint32_t dst_base_layer,
VkImageAspectFlagBits aspect,
@@ -1393,16 +2206,16 @@ anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
enum blorp_filter filter)
{
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
assert(src_image->vk.image_type == VK_IMAGE_TYPE_2D);
assert(src_image->vk.samples > 1);
assert(dst_image->vk.image_type == VK_IMAGE_TYPE_2D);
assert(dst_image->vk.samples == 1);
- assert(src_image->n_planes == dst_image->n_planes);
struct blorp_surf src_surf, dst_surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, src_image, aspect,
+ get_blorp_surf_for_anv_image(cmd_buffer, src_image, aspect,
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
src_aux_usage, &src_surf);
@@ -1411,7 +2224,7 @@ anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
anv_image_get_clear_color_addr(cmd_buffer->device, src_image,
VK_IMAGE_ASPECT_COLOR_BIT));
}
- get_blorp_surf_for_anv_image(cmd_buffer->device, dst_image, aspect,
+ get_blorp_surf_for_anv_image(cmd_buffer, dst_image, aspect,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
dst_aux_usage, &dst_surf);
@@ -1435,15 +2248,105 @@ anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
for (uint32_t l = 0; l < layer_count; l++) {
blorp_blit(&batch,
&src_surf, src_level, src_base_layer + l,
- ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY,
+ src_format_override, ISL_SWIZZLE_IDENTITY,
&dst_surf, dst_level, dst_base_layer + l,
- ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY,
+ dst_format_override, ISL_SWIZZLE_IDENTITY,
src_x, src_y, src_x + width, src_y + height,
dst_x, dst_y, dst_x + width, dst_y + height,
filter, false, false);
}
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
+}
+
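+/* Translate a Vulkan resolve mode into the BLORP filter used for the
+ * multisample resolve blit.
+ */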
+static enum blorp_filter
+vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
+{
+ switch (vk_mode) {
+ case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
+ return BLORP_FILTER_SAMPLE_0;
+ case VK_RESOLVE_MODE_AVERAGE_BIT:
+ return BLORP_FILTER_AVERAGE;
+ case VK_RESOLVE_MODE_MIN_BIT:
+ return BLORP_FILTER_MIN_SAMPLE;
+ case VK_RESOLVE_MODE_MAX_BIT:
+ return BLORP_FILTER_MAX_SAMPLE;
+ default:
+ return BLORP_FILTER_NONE;
+ }
+}
+
+void
+anv_attachment_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_attachment *att,
+ VkImageLayout layout,
+ VkImageAspectFlagBits aspect)
+{
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const struct anv_image_view *src_iview = att->iview;
+ const struct anv_image_view *dst_iview = att->resolve_iview;
+
+ enum isl_aux_usage src_aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ src_iview->image, aspect,
+ VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
+ layout,
+ cmd_buffer->queue_family->queueFlags);
+
+ enum isl_aux_usage dst_aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ dst_iview->image, aspect,
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+ att->resolve_layout,
+ cmd_buffer->queue_family->queueFlags);
+
+ enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
+
+ /* Depth/stencil resolves should not use the view format override because
+ * the two aspects come in pairs.
+ */
+ enum isl_format src_format = ISL_FORMAT_UNSUPPORTED;
+ enum isl_format dst_format = ISL_FORMAT_UNSUPPORTED;
+ if (!(aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
+ src_format = src_iview->planes[0].isl.format;
+ dst_format = dst_iview->planes[0].isl.format;
+ }
+
+ const VkRect2D render_area = gfx->render_area;
+ if (gfx->view_mask == 0) {
+ anv_image_msaa_resolve(cmd_buffer,
+ src_iview->image, src_format, src_aux_usage,
+ src_iview->planes[0].isl.base_level,
+ src_iview->planes[0].isl.base_array_layer,
+ dst_iview->image, dst_format, dst_aux_usage,
+ dst_iview->planes[0].isl.base_level,
+ dst_iview->planes[0].isl.base_array_layer,
+ aspect,
+ render_area.offset.x, render_area.offset.y,
+ render_area.offset.x, render_area.offset.y,
+ render_area.extent.width,
+ render_area.extent.height,
+ gfx->layer_count, filter);
+ } else {
+ uint32_t res_view_mask = gfx->view_mask;
+ while (res_view_mask) {
+ int i = u_bit_scan(&res_view_mask);
+
+ anv_image_msaa_resolve(cmd_buffer,
+ src_iview->image, src_format, src_aux_usage,
+ src_iview->planes[0].isl.base_level,
+ src_iview->planes[0].isl.base_array_layer + i,
+ dst_iview->image, dst_format, dst_aux_usage,
+ dst_iview->planes[0].isl.base_level,
+ dst_iview->planes[0].isl.base_array_layer + i,
+ aspect,
+ render_area.offset.x, render_area.offset.y,
+ render_area.offset.x, render_area.offset.y,
+ render_area.extent.width,
+ render_area.extent.height,
+ 1, filter);
+ }
+ }
}
static void
@@ -1452,7 +2355,7 @@ resolve_image(struct anv_cmd_buffer *cmd_buffer,
VkImageLayout src_image_layout,
struct anv_image *dst_image,
VkImageLayout dst_image_layout,
- const VkImageResolve2KHR *region)
+ const VkImageResolve2 *region)
{
assert(region->srcSubresource.aspectMask == region->dstSubresource.aspectMask);
assert(vk_image_subresource_layer_count(&src_image->vk, &region->srcSubresource) ==
@@ -1464,21 +2367,23 @@ resolve_image(struct anv_cmd_buffer *cmd_buffer,
anv_foreach_image_aspect_bit(aspect_bit, src_image,
region->srcSubresource.aspectMask) {
enum isl_aux_usage src_aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, src_image,
+ anv_layout_to_aux_usage(cmd_buffer->device->info, src_image,
(1 << aspect_bit),
VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
- src_image_layout);
+ src_image_layout,
+ cmd_buffer->queue_family->queueFlags);
enum isl_aux_usage dst_aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_image,
+ anv_layout_to_aux_usage(cmd_buffer->device->info, dst_image,
(1 << aspect_bit),
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- dst_image_layout);
+ dst_image_layout,
+ cmd_buffer->queue_family->queueFlags);
anv_image_msaa_resolve(cmd_buffer,
- src_image, src_aux_usage,
+ src_image, ISL_FORMAT_UNSUPPORTED, src_aux_usage,
region->srcSubresource.mipLevel,
region->srcSubresource.baseArrayLayer,
- dst_image, dst_aux_usage,
+ dst_image, ISL_FORMAT_UNSUPPORTED, dst_aux_usage,
region->dstSubresource.mipLevel,
region->dstSubresource.baseArrayLayer,
(1 << aspect_bit),
@@ -1492,9 +2397,9 @@ resolve_image(struct anv_cmd_buffer *cmd_buffer,
}
}
-void anv_CmdResolveImage2KHR(
+void anv_CmdResolveImage2(
VkCommandBuffer commandBuffer,
- const VkResolveImageInfo2KHR* pResolveImageInfo)
+ const VkResolveImageInfo2* pResolveImageInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_image, src_image, pResolveImageInfo->srcImage);
@@ -1509,63 +2414,6 @@ void anv_CmdResolveImage2KHR(
}
void
-anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- uint32_t base_level, uint32_t level_count,
- uint32_t base_layer, uint32_t layer_count)
-{
- struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
-
- /* We don't know who touched the main surface last so flush a bunch of
- * caches to ensure we get good data.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
- ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT,
- "before copy_to_shadow");
-
- struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device,
- image, aspect,
- VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
- VK_IMAGE_LAYOUT_GENERAL,
- ISL_AUX_USAGE_NONE, &surf);
- assert(surf.aux_usage == ISL_AUX_USAGE_NONE);
-
- struct blorp_surf shadow_surf;
- get_blorp_surf_for_anv_shadow_image(cmd_buffer->device,
- image, aspect, &shadow_surf);
-
- for (uint32_t l = 0; l < level_count; l++) {
- const uint32_t level = base_level + l;
-
- const VkExtent3D extent = vk_image_mip_level_extent(&image->vk, level);
-
- if (image->vk.image_type == VK_IMAGE_TYPE_3D)
- layer_count = extent.depth;
-
- for (uint32_t a = 0; a < layer_count; a++) {
- const uint32_t layer = base_layer + a;
-
- blorp_copy(&batch, &surf, level, layer,
- &shadow_surf, level, layer,
- 0, 0, 0, 0, extent.width, extent.height);
- }
- }
-
- /* We just wrote to the buffer with the render cache. Flush it. */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
- "after copy_to_shadow");
-
- blorp_batch_finish(&batch);
-}
-
-void
anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
VkImageAspectFlagBits aspect,
@@ -1580,10 +2428,10 @@ anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
assert(image->n_planes == 1);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect,
+ get_blorp_surf_for_anv_image(cmd_buffer, image, aspect,
VK_IMAGE_USAGE_TRANSFER_DST_BIT,
ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
aux_usage, &surf);
@@ -1595,9 +2443,9 @@ anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
area.offset.x, area.offset.y,
area.offset.x + area.extent.width,
area.offset.y + area.extent.height,
- clear_color, NULL);
+ clear_color, 0 /* color_write_disable */);
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1614,11 +2462,12 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
VK_IMAGE_ASPECT_STENCIL_BIT));
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
struct blorp_surf depth = {};
if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_DEPTH_BIT,
0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
depth_aux_usage, &depth);
@@ -1628,7 +2477,7 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
const uint32_t plane =
anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_STENCIL_BIT,
0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
image->planes[plane].aux_usage, &stencil);
@@ -1659,28 +2508,10 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
*/
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_TILE_CACHE_FLUSH_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
"after clear DS");
- struct blorp_surf stencil_shadow;
- if ((aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- get_blorp_surf_for_anv_shadow_image(cmd_buffer->device, image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- &stencil_shadow)) {
- union isl_color_value stencil_color = {
- .u32 = { stencil_value },
- };
- blorp_clear(&batch, &stencil_shadow,
- ISL_FORMAT_R8_UINT, ISL_SWIZZLE_IDENTITY,
- level, base_layer, layer_count,
- area.offset.x, area.offset.y,
- area.offset.x + area.extent.width,
- area.offset.y + area.extent.height,
- stencil_color, NULL);
- }
-
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1696,17 +2527,18 @@ anv_image_hiz_op(struct anv_cmd_buffer *cmd_buffer,
assert(plane == 0);
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device,
+ get_blorp_surf_for_anv_image(cmd_buffer,
image, VK_IMAGE_ASPECT_DEPTH_BIT,
0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
image->planes[plane].aux_usage, &surf);
blorp_hiz_op(&batch, &surf, level, base_layer, layer_count, hiz_op);
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1717,86 +2549,14 @@ anv_image_hiz_clear(struct anv_cmd_buffer *cmd_buffer,
uint32_t base_layer, uint32_t layer_count,
VkRect2D area, uint8_t stencil_value)
{
- assert(image->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT));
-
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, 0);
+ anv_blorp_batch_init(cmd_buffer, &batch, 0);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
- struct blorp_surf depth = {};
- if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- const uint32_t plane =
- anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
- assert(base_layer + layer_count <=
- anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level));
- get_blorp_surf_for_anv_image(cmd_buffer->device,
- image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
- image->planes[plane].aux_usage, &depth);
- }
-
- struct blorp_surf stencil = {};
- if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- const uint32_t plane =
- anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
- get_blorp_surf_for_anv_image(cmd_buffer->device,
- image, VK_IMAGE_ASPECT_STENCIL_BIT,
- 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
- image->planes[plane].aux_usage, &stencil);
- }
-
- /* From the Sky Lake PRM Volume 7, "Depth Buffer Clear":
- *
- * "The following is required when performing a depth buffer clear with
- * using the WM_STATE or 3DSTATE_WM:
- *
- * * If other rendering operations have preceded this clear, a
- * PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
- * enabled must be issued before the rectangle primitive used for
- * the depth buffer clear operation.
- * * [...]"
- *
- * Even though the PRM only says that this is required if using 3DSTATE_WM
- * and a 3DPRIMITIVE, the GPU appears to also need this to avoid occasional
- * hangs when doing a clear with WM_HZ_OP.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
- ANV_PIPE_DEPTH_STALL_BIT,
- "before clear hiz");
+ anv_fast_clear_depth_stencil(cmd_buffer, &batch, image, aspects, level,
+ base_layer, layer_count, area, stencil_value);
- blorp_hiz_clear_depth_stencil(&batch, &depth, &stencil,
- level, base_layer, layer_count,
- area.offset.x, area.offset.y,
- area.offset.x + area.extent.width,
- area.offset.y + area.extent.height,
- aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
- ANV_HZ_FC_VAL,
- aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
- stencil_value);
-
- blorp_batch_finish(&batch);
-
- /* From the SKL PRM, Depth Buffer Clear:
- *
- * "Depth Buffer Clear Workaround
- *
- * Depth buffer clear pass using any of the methods (WM_STATE,
- * 3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a PIPE_CONTROL
- * command with DEPTH_STALL bit and Depth FLUSH bits “set” before
- * starting to render. DepthStall and DepthFlush are not needed between
- * consecutive depth clear passes nor is it required if the depth-clear
- * pass was done with “full_surf_clear” bit set in the
- * 3DSTATE_WM_HZ_OP."
- *
- * Even though the PRM provides a bunch of conditions under which this is
- * supposedly unnecessary, we choose to perform the flush unconditionally
- * just to be safe.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
- ANV_PIPE_DEPTH_STALL_BIT,
- "after clear hiz");
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1808,73 +2568,16 @@ anv_image_mcs_op(struct anv_cmd_buffer *cmd_buffer,
enum isl_aux_op mcs_op, union isl_color_value *clear_value,
bool predicate)
{
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- assert(image->vk.samples > 1);
- assert(base_layer + layer_count <= anv_image_aux_layers(image, aspect, 0));
-
- /* Multisampling with multi-planar formats is not supported */
- assert(image->n_planes == 1);
-
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer,
- BLORP_BATCH_PREDICATE_ENABLE * predicate +
- BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value);
-
- struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect,
- 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
- ISL_AUX_USAGE_MCS, &surf);
-
- /* Blorp will store the clear color for us if we provide the clear color
- * address and we are doing a fast clear. So we save the clear value into
- * the blorp surface.
- */
- if (clear_value)
- surf.clear_color = *clear_value;
-
- /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
- *
- * "After Render target fast clear, pipe-control with color cache
- * write-flush must be issued before sending any DRAW commands on
- * that render target."
- *
- * This comment is a bit cryptic and doesn't really tell you what's going
- * or what's really needed. It appears that fast clear ops are not
- * properly synchronized with other drawing. This means that we cannot
- * have a fast clear operation in the pipe at the same time as other
- * regular drawing operations. We need to use a PIPE_CONTROL to ensure
- * that the contents of the previous draw hit the render target before we
- * resolve and then use a second PIPE_CONTROL after the resolve to ensure
- * that it is completed before any additional drawing occurs.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_TILE_CACHE_FLUSH_BIT |
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "before fast clear mcs");
-
- switch (mcs_op) {
- case ISL_AUX_OP_FAST_CLEAR:
- blorp_fast_clear(&batch, &surf, format, swizzle,
- 0, base_layer, layer_count,
- 0, 0, image->vk.extent.width, image->vk.extent.height);
- break;
- case ISL_AUX_OP_PARTIAL_RESOLVE:
- blorp_mcs_partial_resolve(&batch, &surf, format,
- base_layer, layer_count);
- break;
- case ISL_AUX_OP_FULL_RESOLVE:
- case ISL_AUX_OP_AMBIGUATE:
- default:
- unreachable("Unsupported MCS operation");
- }
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ BLORP_BATCH_PREDICATE_ENABLE * predicate +
+ BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "after fast clear mcs");
+ exec_mcs_op(cmd_buffer, &batch, image, format, swizzle, aspect,
+ base_layer, layer_count, mcs_op, clear_value);
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
void
@@ -1886,83 +2589,14 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
enum isl_aux_op ccs_op, union isl_color_value *clear_value,
bool predicate)
{
- assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
- assert(image->vk.samples == 1);
- assert(level < anv_image_aux_levels(image, aspect));
- /* Multi-LOD YcBcR is not allowed */
- assert(image->n_planes == 1 || level == 0);
- assert(base_layer + layer_count <=
- anv_image_aux_layers(image, aspect, level));
-
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
-
struct blorp_batch batch;
- blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer,
- BLORP_BATCH_PREDICATE_ENABLE * predicate +
- BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value);
+ anv_blorp_batch_init(cmd_buffer, &batch,
+ BLORP_BATCH_PREDICATE_ENABLE * predicate +
+ BLORP_BATCH_NO_UPDATE_CLEAR_COLOR * !clear_value);
+ assert((batch.flags & BLORP_BATCH_USE_COMPUTE) == 0);
- struct blorp_surf surf;
- get_blorp_surf_for_anv_image(cmd_buffer->device, image, aspect,
- 0, ANV_IMAGE_LAYOUT_EXPLICIT_AUX,
- image->planes[plane].aux_usage,
- &surf);
-
- uint32_t level_width = anv_minify(surf.surf->logical_level0_px.w, level);
- uint32_t level_height = anv_minify(surf.surf->logical_level0_px.h, level);
-
- /* Blorp will store the clear color for us if we provide the clear color
- * address and we are doing a fast clear. So we save the clear value into
- * the blorp surface.
- */
- if (clear_value)
- surf.clear_color = *clear_value;
-
- /* From the Sky Lake PRM Vol. 7, "Render Target Fast Clear":
- *
- * "After Render target fast clear, pipe-control with color cache
- * write-flush must be issued before sending any DRAW commands on
- * that render target."
- *
- * This comment is a bit cryptic and doesn't really tell you what's going
- * or what's really needed. It appears that fast clear ops are not
- * properly synchronized with other drawing. This means that we cannot
- * have a fast clear operation in the pipe at the same time as other
- * regular drawing operations. We need to use a PIPE_CONTROL to ensure
- * that the contents of the previous draw hit the render target before we
- * resolve and then use a second PIPE_CONTROL after the resolve to ensure
- * that it is completed before any additional drawing occurs.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_TILE_CACHE_FLUSH_BIT |
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "before fast clear ccs");
-
- switch (ccs_op) {
- case ISL_AUX_OP_FAST_CLEAR:
- blorp_fast_clear(&batch, &surf, format, swizzle,
- level, base_layer, layer_count,
- 0, 0, level_width, level_height);
- break;
- case ISL_AUX_OP_FULL_RESOLVE:
- case ISL_AUX_OP_PARTIAL_RESOLVE:
- blorp_ccs_resolve(&batch, &surf, level, base_layer, layer_count,
- format, ccs_op);
- break;
- case ISL_AUX_OP_AMBIGUATE:
- for (uint32_t a = 0; a < layer_count; a++) {
- const uint32_t layer = base_layer + a;
- blorp_ccs_ambiguate(&batch, &surf, level, layer);
- }
- break;
- default:
- unreachable("Unsupported CCS operation");
- }
-
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "after fast clear ccs");
+ exec_ccs_op(cmd_buffer, &batch, image, format, swizzle, aspect, level,
+ base_layer, layer_count, ccs_op, clear_value);
- blorp_batch_finish(&batch);
+ anv_blorp_batch_finish(&batch);
}
diff --git a/src/intel/vulkan/anv_bo_sync.c b/src/intel/vulkan/anv_bo_sync.c
new file mode 100644
index 00000000000..af12c6d61dd
--- /dev/null
+++ b/src/intel/vulkan/anv_bo_sync.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "util/os_time.h"
+#include "util/perf/cpu_trace.h"
+
+static struct anv_bo_sync *
+to_anv_bo_sync(struct vk_sync *sync)
+{
+ assert(sync->type == &anv_bo_sync_type);
+ return container_of(sync, struct anv_bo_sync, sync);
+}
+
+static VkResult
+anv_bo_sync_init(struct vk_device *vk_device,
+ struct vk_sync *vk_sync,
+ uint64_t initial_value)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync);
+
+ sync->state = initial_value ? ANV_BO_SYNC_STATE_SIGNALED :
+ ANV_BO_SYNC_STATE_RESET;
+
+ return anv_device_alloc_bo(device, "bo-sync", 4096,
+ ANV_BO_ALLOC_EXTERNAL |
+ ANV_BO_ALLOC_IMPLICIT_SYNC |
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */,
+ &sync->bo);
+}
+
+static void
+anv_bo_sync_finish(struct vk_device *vk_device,
+ struct vk_sync *vk_sync)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync);
+
+ anv_device_release_bo(device, sync->bo);
+}
+
+static VkResult
+anv_bo_sync_reset(struct vk_device *vk_device,
+ struct vk_sync *vk_sync)
+{
+ struct anv_bo_sync *sync = to_anv_bo_sync(vk_sync);
+
+ sync->state = ANV_BO_SYNC_STATE_RESET;
+
+ return VK_SUCCESS;
+}
+
+static int64_t
+anv_get_relative_timeout(uint64_t abs_timeout)
+{
+ uint64_t now = os_time_get_nano();
+
+ /* We don't want negative timeouts.
+ *
+ * DRM_IOCTL_I915_GEM_WAIT uses a signed 64-bit timeout and is
+ * supposed to block indefinitely for timeouts < 0. Unfortunately,
+ * this was broken for a couple of kernel releases. Since there's
+ * no way to know whether or not the kernel we're using is one of
+ * the broken ones, the best we can do is to clamp the timeout to
+ * INT64_MAX. This limits the maximum timeout from 584 years to
+ * 292 years - likely not a big deal.
+ */
+ if (abs_timeout < now)
+ return 0;
+
+ uint64_t rel_timeout = abs_timeout - now;
+ if (rel_timeout > (uint64_t) INT64_MAX)
+ rel_timeout = INT64_MAX;
+
+ return rel_timeout;
+}
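
A side note on the arithmetic in the comment above: a 64-bit nanosecond count covers roughly 584 years, so clamping to the signed range (INT64_MAX) halves that to roughly 292. A minimal, standalone check of those figures (illustration only, not driver code):

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      /* Approximate a year as 365.25 days, expressed in nanoseconds. */
      const double ns_per_year = 365.25 * 24.0 * 3600.0 * 1e9;
      printf("uint64 range: ~%.1f years\n", UINT64_MAX / ns_per_year); /* ~584.5 */
      printf("int64 range:  ~%.1f years\n", INT64_MAX / ns_per_year);  /* ~292.3 */
      return 0;
   }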
+
+static VkResult
+anv_bo_sync_wait(struct vk_device *vk_device,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ enum vk_sync_wait_flags wait_flags,
+ uint64_t abs_timeout_ns)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ VkResult result;
+ MESA_TRACE_FUNC();
+
+ uint32_t pending = wait_count;
+ while (pending) {
+ pending = 0;
+ bool signaled = false;
+ for (uint32_t i = 0; i < wait_count; i++) {
+ struct anv_bo_sync *sync = to_anv_bo_sync(waits[i].sync);
+ switch (sync->state) {
+ case ANV_BO_SYNC_STATE_RESET:
+ /* This fence hasn't been submitted yet, we'll catch it the next
+ * time around. Yes, this may mean we dead-loop but, short of
+ * lots of locking and a condition variable, there's not much that
+ * we can do about that.
+ */
+ assert(!(wait_flags & VK_SYNC_WAIT_PENDING));
+ pending++;
+ continue;
+
+ case ANV_BO_SYNC_STATE_SIGNALED:
+ /* This fence is not pending. If waitAll isn't set, we can return
+ * early. Otherwise, we have to keep going.
+ */
+ if (wait_flags & VK_SYNC_WAIT_ANY)
+ return VK_SUCCESS;
+ continue;
+
+ case ANV_BO_SYNC_STATE_SUBMITTED:
+ /* These are the fences we really care about. Go ahead and wait
+ * on it until we hit a timeout.
+ */
+ if (!(wait_flags & VK_SYNC_WAIT_PENDING)) {
+ uint64_t rel_timeout = anv_get_relative_timeout(abs_timeout_ns);
+ result = anv_device_wait(device, sync->bo, rel_timeout);
+ /* This also covers VK_TIMEOUT */
+ if (result != VK_SUCCESS)
+ return result;
+
+ sync->state = ANV_BO_SYNC_STATE_SIGNALED;
+ signaled = true;
+ }
+ if (wait_flags & VK_SYNC_WAIT_ANY)
+ return VK_SUCCESS;
+ break;
+
+ default:
+ unreachable("Invalid BO sync state");
+ }
+ }
+
+ if (pending && !signaled) {
+ /* If we've hit this then someone decided to vkWaitForFences before
+ * they've actually submitted any of them to a queue. This is a
+ * fairly pessimal case, so it's ok to lock here and use a standard
+ * pthreads condition variable.
+ */
+ pthread_mutex_lock(&device->mutex);
+
+ /* It's possible that some of the fences have changed state since the
+ * last time we checked. Now that we have the lock, check for
+ * pending fences again and don't wait if it's changed.
+ */
+ uint32_t now_pending = 0;
+ for (uint32_t i = 0; i < wait_count; i++) {
+ struct anv_bo_sync *sync = to_anv_bo_sync(waits[i].sync);
+ if (sync->state == ANV_BO_SYNC_STATE_RESET)
+ now_pending++;
+ }
+ assert(now_pending <= pending);
+
+ if (now_pending == pending) {
+ struct timespec abstime = {
+ .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
+ .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
+ };
+
+ ASSERTED int ret;
+ ret = pthread_cond_timedwait(&device->queue_submit,
+ &device->mutex, &abstime);
+ assert(ret != EINVAL);
+ if (os_time_get_nano() >= abs_timeout_ns) {
+ pthread_mutex_unlock(&device->mutex);
+ return VK_TIMEOUT;
+ }
+ }
+
+ pthread_mutex_unlock(&device->mutex);
+ }
+ }
+
+ return VK_SUCCESS;
+}
+
+const struct vk_sync_type anv_bo_sync_type = {
+ .size = sizeof(struct anv_bo_sync),
+ .features = VK_SYNC_FEATURE_BINARY |
+ VK_SYNC_FEATURE_GPU_WAIT |
+ VK_SYNC_FEATURE_GPU_MULTI_WAIT |
+ VK_SYNC_FEATURE_CPU_WAIT |
+ VK_SYNC_FEATURE_CPU_RESET |
+ VK_SYNC_FEATURE_WAIT_ANY |
+ VK_SYNC_FEATURE_WAIT_PENDING,
+ .init = anv_bo_sync_init,
+ .finish = anv_bo_sync_finish,
+ .reset = anv_bo_sync_reset,
+ .wait_many = anv_bo_sync_wait,
+};
+
+VkResult
+anv_create_sync_for_memory(struct vk_device *device,
+ VkDeviceMemory memory,
+ bool signal_memory,
+ struct vk_sync **sync_out)
+{
+ ANV_FROM_HANDLE(anv_device_memory, mem, memory);
+ struct anv_bo_sync *bo_sync;
+
+ bo_sync = vk_zalloc(&device->alloc, sizeof(*bo_sync), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (bo_sync == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ bo_sync->sync.type = &anv_bo_sync_type;
+ bo_sync->state = signal_memory ? ANV_BO_SYNC_STATE_RESET :
+ ANV_BO_SYNC_STATE_SUBMITTED;
+ bo_sync->bo = anv_bo_ref(mem->bo);
+
+ *sync_out = &bo_sync->sync;
+
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index ece9dd32f96..25a79f3e52f 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -41,181 +41,6 @@
* is concerned, most of anv_cmd_buffer is magic.
*/
-/* TODO: These are taken from GLES. We should check the Vulkan spec */
-const struct anv_dynamic_state default_dynamic_state = {
- .viewport = {
- .count = 0,
- },
- .scissor = {
- .count = 0,
- },
- .line_width = 1.0f,
- .depth_bias = {
- .bias = 0.0f,
- .clamp = 0.0f,
- .slope = 0.0f,
- },
- .blend_constants = { 0.0f, 0.0f, 0.0f, 0.0f },
- .depth_bounds = {
- .min = 0.0f,
- .max = 1.0f,
- },
- .stencil_compare_mask = {
- .front = ~0u,
- .back = ~0u,
- },
- .stencil_write_mask = {
- .front = ~0u,
- .back = ~0u,
- },
- .stencil_reference = {
- .front = 0u,
- .back = 0u,
- },
- .stencil_op = {
- .front = {
- .fail_op = 0,
- .pass_op = 0,
- .depth_fail_op = 0,
- .compare_op = 0,
- },
- .back = {
- .fail_op = 0,
- .pass_op = 0,
- .depth_fail_op = 0,
- .compare_op = 0,
- },
- },
- .line_stipple = {
- .factor = 0u,
- .pattern = 0u,
- },
- .cull_mode = 0,
- .front_face = 0,
- .primitive_topology = 0,
- .depth_test_enable = 0,
- .depth_write_enable = 0,
- .depth_compare_op = 0,
- .depth_bounds_test_enable = 0,
- .stencil_test_enable = 0,
- .dyn_vbo_stride = 0,
- .dyn_vbo_size = 0,
- .color_writes = 0xff,
- .raster_discard = 0,
- .depth_bias_enable = 0,
- .primitive_restart_enable = 0,
- .logic_op = 0,
-};
-
-/**
- * Copy the dynamic state from src to dest based on the copy_mask.
- *
- * Avoid copying states that have not changed, except for VIEWPORT, SCISSOR and
- * BLEND_CONSTANTS (always copy them if they are in the copy_mask).
- *
- * Returns a mask of the states which changed.
- */
-anv_cmd_dirty_mask_t
-anv_dynamic_state_copy(struct anv_dynamic_state *dest,
- const struct anv_dynamic_state *src,
- anv_cmd_dirty_mask_t copy_mask)
-{
- anv_cmd_dirty_mask_t changed = 0;
-
- if (copy_mask & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) {
- dest->viewport.count = src->viewport.count;
- typed_memcpy(dest->viewport.viewports, src->viewport.viewports,
- src->viewport.count);
- changed |= ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
- }
-
- if (copy_mask & ANV_CMD_DIRTY_DYNAMIC_SCISSOR) {
- dest->scissor.count = src->scissor.count;
- typed_memcpy(dest->scissor.scissors, src->scissor.scissors,
- src->scissor.count);
- changed |= ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
- }
-
- if (copy_mask & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
- typed_memcpy(dest->blend_constants, src->blend_constants, 4);
- changed |= ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
- }
-
-#define ANV_CMP_COPY(field, flag) \
- if (copy_mask & flag) { \
- if (dest->field != src->field) { \
- dest->field = src->field; \
- changed |= flag; \
- } \
- }
-
- ANV_CMP_COPY(line_width, ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH);
-
- ANV_CMP_COPY(depth_bias.bias, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS);
- ANV_CMP_COPY(depth_bias.clamp, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS);
- ANV_CMP_COPY(depth_bias.slope, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS);
-
- ANV_CMP_COPY(depth_bounds.min, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS);
- ANV_CMP_COPY(depth_bounds.max, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS);
-
- ANV_CMP_COPY(stencil_compare_mask.front, ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK);
- ANV_CMP_COPY(stencil_compare_mask.back, ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK);
-
- ANV_CMP_COPY(stencil_write_mask.front, ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK);
- ANV_CMP_COPY(stencil_write_mask.back, ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK);
-
- ANV_CMP_COPY(stencil_reference.front, ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE);
- ANV_CMP_COPY(stencil_reference.back, ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE);
-
- ANV_CMP_COPY(line_stipple.factor, ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE);
- ANV_CMP_COPY(line_stipple.pattern, ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE);
-
- ANV_CMP_COPY(cull_mode, ANV_CMD_DIRTY_DYNAMIC_CULL_MODE);
- ANV_CMP_COPY(front_face, ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE);
- ANV_CMP_COPY(primitive_topology, ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY);
- ANV_CMP_COPY(depth_test_enable, ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE);
- ANV_CMP_COPY(depth_write_enable, ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE);
- ANV_CMP_COPY(depth_compare_op, ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP);
- ANV_CMP_COPY(depth_bounds_test_enable, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE);
- ANV_CMP_COPY(stencil_test_enable, ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE);
-
- if (copy_mask & VK_DYNAMIC_STATE_STENCIL_OP_EXT) {
- ANV_CMP_COPY(stencil_op.front.fail_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.front.pass_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.front.depth_fail_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.front.compare_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.back.fail_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.back.pass_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.back.depth_fail_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- ANV_CMP_COPY(stencil_op.back.compare_op, ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP);
- }
-
- ANV_CMP_COPY(dyn_vbo_stride, ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE);
- ANV_CMP_COPY(dyn_vbo_size, ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE);
-
- ANV_CMP_COPY(raster_discard, ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
- ANV_CMP_COPY(depth_bias_enable, ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE);
- ANV_CMP_COPY(primitive_restart_enable, ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE);
- ANV_CMP_COPY(logic_op, ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP);
-
- if (copy_mask & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) {
- dest->sample_locations.samples = src->sample_locations.samples;
- typed_memcpy(dest->sample_locations.locations,
- src->sample_locations.locations,
- dest->sample_locations.samples);
- changed |= ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
- }
-
- ANV_CMP_COPY(color_writes, ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
-
- ANV_CMP_COPY(fragment_shading_rate.width, ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE);
- ANV_CMP_COPY(fragment_shading_rate.height, ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE);
-
-#undef ANV_CMP_COPY
-
- return changed;
-}
-
static void
anv_cmd_state_init(struct anv_cmd_buffer *cmd_buffer)
{
@@ -224,21 +49,20 @@ anv_cmd_state_init(struct anv_cmd_buffer *cmd_buffer)
memset(state, 0, sizeof(*state));
state->current_pipeline = UINT32_MAX;
- state->restart_index = UINT32_MAX;
- state->gfx.dynamic = default_dynamic_state;
+ state->gfx.restart_index = UINT32_MAX;
+ state->gfx.object_preemption = true;
+ state->gfx.dirty = 0;
+
+ memcpy(state->gfx.dyn_state.dirty,
+ cmd_buffer->device->gfx_dirty_state,
+ sizeof(state->gfx.dyn_state.dirty));
}
static void
anv_cmd_pipeline_state_finish(struct anv_cmd_buffer *cmd_buffer,
struct anv_cmd_pipeline_state *pipe_state)
{
- for (uint32_t i = 0; i < ARRAY_SIZE(pipe_state->push_descriptors); i++) {
- if (pipe_state->push_descriptors[i]) {
- anv_descriptor_set_layout_unref(cmd_buffer->device,
- pipe_state->push_descriptors[i]->set.layout);
- vk_free(&cmd_buffer->pool->alloc, pipe_state->push_descriptors[i]);
- }
- }
+ anv_push_descriptor_set_finish(&pipe_state->push_descriptor);
}
static void
@@ -248,8 +72,6 @@ anv_cmd_state_finish(struct anv_cmd_buffer *cmd_buffer)
anv_cmd_pipeline_state_finish(cmd_buffer, &state->gfx.base);
anv_cmd_pipeline_state_finish(cmd_buffer, &state->compute.base);
-
- vk_free(&cmd_buffer->pool->alloc, state->attachments);
}
static void
@@ -257,158 +79,262 @@ anv_cmd_state_reset(struct anv_cmd_buffer *cmd_buffer)
{
anv_cmd_state_finish(cmd_buffer);
anv_cmd_state_init(cmd_buffer);
+
+ cmd_buffer->last_compute_walker = NULL;
+ cmd_buffer->last_indirect_dispatch = NULL;
}
-static VkResult anv_create_cmd_buffer(
- struct anv_device * device,
- struct anv_cmd_pool * pool,
- VkCommandBufferLevel level,
- VkCommandBuffer* pCommandBuffer)
+VkResult
+anv_cmd_buffer_ensure_rcs_companion(struct anv_cmd_buffer *cmd_buffer)
{
+ if (cmd_buffer->companion_rcs_cmd_buffer)
+ return VK_SUCCESS;
+
+ VkResult result = VK_SUCCESS;
+ pthread_mutex_lock(&cmd_buffer->device->mutex);
+ VK_FROM_HANDLE(vk_command_pool, pool,
+ cmd_buffer->device->companion_rcs_cmd_pool);
+ assert(pool != NULL);
+
+ struct vk_command_buffer *tmp_cmd_buffer = NULL;
+ result = pool->command_buffer_ops->create(pool, cmd_buffer->vk.level, &tmp_cmd_buffer);
+
+ if (result != VK_SUCCESS)
+ goto unlock_and_return;
+
+ cmd_buffer->companion_rcs_cmd_buffer =
+ container_of(tmp_cmd_buffer, struct anv_cmd_buffer, vk);
+ anv_genX(cmd_buffer->device->info, cmd_buffer_begin_companion)(
+ cmd_buffer->companion_rcs_cmd_buffer, cmd_buffer->vk.level);
+
+unlock_and_return:
+ pthread_mutex_unlock(&cmd_buffer->device->mutex);
+ return result;
+}
+
+static VkResult
+anv_create_cmd_buffer(struct vk_command_pool *pool,
+ VkCommandBufferLevel level,
+ struct vk_command_buffer **cmd_buffer_out)
+{
+ struct anv_device *device =
+ container_of(pool->base.device, struct anv_device, vk);
struct anv_cmd_buffer *cmd_buffer;
VkResult result;
- cmd_buffer = vk_object_alloc(&device->vk, &pool->alloc, sizeof(*cmd_buffer),
- VK_OBJECT_TYPE_COMMAND_BUFFER);
+ cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (cmd_buffer == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(pool, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ result = vk_command_buffer_init(pool, &cmd_buffer->vk,
+ &anv_cmd_buffer_ops, level);
+ if (result != VK_SUCCESS)
+ goto fail_alloc;
+
+ cmd_buffer->vk.dynamic_graphics_state.ms.sample_locations =
+ &cmd_buffer->state.gfx.sample_locations;
+ cmd_buffer->vk.dynamic_graphics_state.vi =
+ &cmd_buffer->state.gfx.vertex_input;
cmd_buffer->batch.status = VK_SUCCESS;
+ cmd_buffer->generation.batch.status = VK_SUCCESS;
cmd_buffer->device = device;
- cmd_buffer->pool = pool;
- cmd_buffer->level = level;
+
+ assert(pool->queue_family_index < device->physical->queue.family_count);
+ cmd_buffer->queue_family =
+ &device->physical->queue.families[pool->queue_family_index];
result = anv_cmd_buffer_init_batch_bo_chain(cmd_buffer);
if (result != VK_SUCCESS)
- goto fail;
+ goto fail_vk;
anv_state_stream_init(&cmd_buffer->surface_state_stream,
- &device->surface_state_pool, 4096);
+ &device->internal_surface_state_pool, 4096);
anv_state_stream_init(&cmd_buffer->dynamic_state_stream,
&device->dynamic_state_pool, 16384);
+ anv_state_stream_init(&cmd_buffer->dynamic_state_db_stream,
+ &device->dynamic_state_db_pool, 16384);
anv_state_stream_init(&cmd_buffer->general_state_stream,
&device->general_state_pool, 16384);
+ anv_state_stream_init(&cmd_buffer->indirect_push_descriptor_stream,
+ &device->indirect_push_descriptor_pool, 4096);
+ anv_state_stream_init(&cmd_buffer->push_descriptor_buffer_stream,
+ &device->push_descriptor_buffer_pool, 4096);
- cmd_buffer->self_mod_locations = NULL;
-
- anv_cmd_state_init(cmd_buffer);
+ int success = u_vector_init_pow2(&cmd_buffer->dynamic_bos, 8,
+ sizeof(struct anv_bo *));
+ if (!success)
+ goto fail_batch_bo;
- list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
+ cmd_buffer->self_mod_locations = NULL;
+ cmd_buffer->companion_rcs_cmd_buffer = NULL;
+ cmd_buffer->is_companion_rcs_cmd_buffer = false;
- anv_measure_init(cmd_buffer);
+ cmd_buffer->generation.jump_addr = ANV_NULL_ADDRESS;
+ cmd_buffer->generation.return_addr = ANV_NULL_ADDRESS;
- *pCommandBuffer = anv_cmd_buffer_to_handle(cmd_buffer);
+ cmd_buffer->last_compute_walker = NULL;
+ cmd_buffer->last_indirect_dispatch = NULL;
- return VK_SUCCESS;
+ memset(&cmd_buffer->generation.shader_state, 0,
+ sizeof(cmd_buffer->generation.shader_state));
- fail:
- vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
+ anv_cmd_state_init(cmd_buffer);
- return result;
-}
+ anv_measure_init(cmd_buffer);
-VkResult anv_AllocateCommandBuffers(
- VkDevice _device,
- const VkCommandBufferAllocateInfo* pAllocateInfo,
- VkCommandBuffer* pCommandBuffers)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_cmd_pool, pool, pAllocateInfo->commandPool);
+ u_trace_init(&cmd_buffer->trace, &device->ds.trace_context);
- VkResult result = VK_SUCCESS;
- uint32_t i;
+ *cmd_buffer_out = &cmd_buffer->vk;
- for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
- result = anv_create_cmd_buffer(device, pool, pAllocateInfo->level,
- &pCommandBuffers[i]);
- if (result != VK_SUCCESS)
- break;
- }
+ return VK_SUCCESS;
- if (result != VK_SUCCESS) {
- anv_FreeCommandBuffers(_device, pAllocateInfo->commandPool,
- i, pCommandBuffers);
- for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
- pCommandBuffers[i] = VK_NULL_HANDLE;
- }
+ fail_batch_bo:
+ anv_cmd_buffer_fini_batch_bo_chain(cmd_buffer);
+ fail_vk:
+ vk_command_buffer_finish(&cmd_buffer->vk);
+ fail_alloc:
+ vk_free2(&device->vk.alloc, &pool->alloc, cmd_buffer);
return result;
}
static void
-anv_cmd_buffer_destroy(struct anv_cmd_buffer *cmd_buffer)
+destroy_cmd_buffer(struct anv_cmd_buffer *cmd_buffer)
{
- anv_measure_destroy(cmd_buffer);
+ u_trace_fini(&cmd_buffer->trace);
- list_del(&cmd_buffer->pool_link);
+ anv_measure_destroy(cmd_buffer);
anv_cmd_buffer_fini_batch_bo_chain(cmd_buffer);
anv_state_stream_finish(&cmd_buffer->surface_state_stream);
anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
+ anv_state_stream_finish(&cmd_buffer->dynamic_state_db_stream);
anv_state_stream_finish(&cmd_buffer->general_state_stream);
+ anv_state_stream_finish(&cmd_buffer->indirect_push_descriptor_stream);
+ anv_state_stream_finish(&cmd_buffer->push_descriptor_buffer_stream);
+
+ while (u_vector_length(&cmd_buffer->dynamic_bos) > 0) {
+ struct anv_bo **bo = u_vector_remove(&cmd_buffer->dynamic_bos);
+ anv_bo_pool_free((*bo)->map != NULL ?
+ &cmd_buffer->device->batch_bo_pool :
+ &cmd_buffer->device->bvh_bo_pool, *bo);
+ }
+ u_vector_finish(&cmd_buffer->dynamic_bos);
anv_cmd_state_finish(cmd_buffer);
- vk_free(&cmd_buffer->pool->alloc, cmd_buffer->self_mod_locations);
+ vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->self_mod_locations);
- vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer);
+ vk_command_buffer_finish(&cmd_buffer->vk);
+ vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}
-void anv_FreeCommandBuffers(
- VkDevice device,
- VkCommandPool commandPool,
- uint32_t commandBufferCount,
- const VkCommandBuffer* pCommandBuffers)
+static void
+anv_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
- for (uint32_t i = 0; i < commandBufferCount; i++) {
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
-
- if (!cmd_buffer)
- continue;
+ struct anv_cmd_buffer *cmd_buffer =
+ container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
+ struct anv_device *device = cmd_buffer->device;
- anv_cmd_buffer_destroy(cmd_buffer);
+ pthread_mutex_lock(&device->mutex);
+ if (cmd_buffer->companion_rcs_cmd_buffer) {
+ destroy_cmd_buffer(cmd_buffer->companion_rcs_cmd_buffer);
+ cmd_buffer->companion_rcs_cmd_buffer = NULL;
}
+
+ ANV_RMV(cmd_buffer_destroy, cmd_buffer->device, cmd_buffer);
+
+ destroy_cmd_buffer(cmd_buffer);
+ pthread_mutex_unlock(&device->mutex);
}
-VkResult
-anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer)
+static void
+reset_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
+ UNUSED VkCommandBufferResetFlags flags)
{
+ vk_command_buffer_reset(&cmd_buffer->vk);
+
cmd_buffer->usage_flags = 0;
cmd_buffer->perf_query_pool = NULL;
+ cmd_buffer->is_companion_rcs_cmd_buffer = false;
anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer);
anv_cmd_state_reset(cmd_buffer);
+ memset(&cmd_buffer->generation.shader_state, 0,
+ sizeof(cmd_buffer->generation.shader_state));
+
+ cmd_buffer->generation.jump_addr = ANV_NULL_ADDRESS;
+ cmd_buffer->generation.return_addr = ANV_NULL_ADDRESS;
+
anv_state_stream_finish(&cmd_buffer->surface_state_stream);
anv_state_stream_init(&cmd_buffer->surface_state_stream,
- &cmd_buffer->device->surface_state_pool, 4096);
+ &cmd_buffer->device->internal_surface_state_pool, 4096);
anv_state_stream_finish(&cmd_buffer->dynamic_state_stream);
anv_state_stream_init(&cmd_buffer->dynamic_state_stream,
&cmd_buffer->device->dynamic_state_pool, 16384);
+ anv_state_stream_finish(&cmd_buffer->dynamic_state_db_stream);
+ anv_state_stream_init(&cmd_buffer->dynamic_state_db_stream,
+ &cmd_buffer->device->dynamic_state_db_pool, 16384);
+
anv_state_stream_finish(&cmd_buffer->general_state_stream);
anv_state_stream_init(&cmd_buffer->general_state_stream,
&cmd_buffer->device->general_state_pool, 16384);
+ anv_state_stream_finish(&cmd_buffer->indirect_push_descriptor_stream);
+ anv_state_stream_init(&cmd_buffer->indirect_push_descriptor_stream,
+ &cmd_buffer->device->indirect_push_descriptor_pool,
+ 4096);
+
+ anv_state_stream_finish(&cmd_buffer->push_descriptor_buffer_stream);
+ anv_state_stream_init(&cmd_buffer->push_descriptor_buffer_stream,
+ &cmd_buffer->device->push_descriptor_buffer_pool, 4096);
+
+ while (u_vector_length(&cmd_buffer->dynamic_bos) > 0) {
+ struct anv_bo **bo = u_vector_remove(&cmd_buffer->dynamic_bos);
+ anv_device_release_bo(cmd_buffer->device, *bo);
+ }
+
anv_measure_reset(cmd_buffer);
- return VK_SUCCESS;
+
+ u_trace_fini(&cmd_buffer->trace);
+ u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->ds.trace_context);
}
-VkResult anv_ResetCommandBuffer(
- VkCommandBuffer commandBuffer,
- VkCommandBufferResetFlags flags)
+void
+anv_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
+ UNUSED VkCommandBufferResetFlags flags)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- return anv_cmd_buffer_reset(cmd_buffer);
+ struct anv_cmd_buffer *cmd_buffer =
+ container_of(vk_cmd_buffer, struct anv_cmd_buffer, vk);
+
+ if (cmd_buffer->companion_rcs_cmd_buffer) {
+ reset_cmd_buffer(cmd_buffer->companion_rcs_cmd_buffer, flags);
+ destroy_cmd_buffer(cmd_buffer->companion_rcs_cmd_buffer);
+ cmd_buffer->companion_rcs_cmd_buffer = NULL;
+ }
+
+ ANV_RMV(cmd_buffer_destroy, cmd_buffer->device, cmd_buffer);
+
+ reset_cmd_buffer(cmd_buffer, flags);
}
+const struct vk_command_buffer_ops anv_cmd_buffer_ops = {
+ .create = anv_create_cmd_buffer,
+ .reset = anv_cmd_buffer_reset,
+ .destroy = anv_cmd_buffer_destroy,
+};
+
void
-anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer)
+anv_cmd_buffer_emit_bt_pool_base_address(struct anv_cmd_buffer *cmd_buffer)
{
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
- anv_genX(devinfo, cmd_buffer_emit_state_base_address)(cmd_buffer);
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ anv_genX(devinfo, cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
}
void
@@ -420,7 +346,7 @@ anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer,
uint32_t base_layer,
uint32_t layer_count)
{
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
anv_genX(devinfo, cmd_buffer_mark_image_written)(cmd_buffer, image,
aspect, aux_usage,
level, base_layer,
@@ -428,12 +354,63 @@ anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer,
}
void
+anv_cmd_buffer_mark_image_fast_cleared(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ const enum isl_format format,
+ union isl_color_value clear_color)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ anv_genX(devinfo, set_fast_clear_state)(cmd_buffer, image, format,
+ clear_color);
+}
+
+void
+anv_cmd_buffer_load_clear_color_from_image(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state state,
+ const struct anv_image *image)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ anv_genX(devinfo, load_image_clear_color)(cmd_buffer, state, image);
+}
+
+void
anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer)
{
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
anv_genX(devinfo, cmd_emit_conditional_render_predicate)(cmd_buffer);
}
+static void
+clear_pending_query_bits(enum anv_query_bits *query_bits,
+ enum anv_pipe_bits flushed_bits)
+{
+ if (flushed_bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
+ *query_bits &= ~ANV_QUERY_WRITES_RT_FLUSH;
+
+ if (flushed_bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT)
+ *query_bits &= ~ANV_QUERY_WRITES_TILE_FLUSH;
+
+ if ((flushed_bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT) &&
+ (flushed_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT) &&
+ (flushed_bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT))
+ *query_bits &= ~ANV_QUERY_WRITES_TILE_FLUSH;
+
+ /* Once RT/TILE have been flushed, we can consider the CS_STALL flush */
+ if ((*query_bits & (ANV_QUERY_WRITES_TILE_FLUSH |
+ ANV_QUERY_WRITES_RT_FLUSH |
+ ANV_QUERY_WRITES_DATA_FLUSH)) == 0 &&
+ (flushed_bits & (ANV_PIPE_END_OF_PIPE_SYNC_BIT | ANV_PIPE_CS_STALL_BIT)))
+ *query_bits &= ~ANV_QUERY_WRITES_CS_STALL;
+}
+
+void
+anv_cmd_buffer_update_pending_query_bits(struct anv_cmd_buffer *cmd_buffer,
+ enum anv_pipe_bits flushed_bits)
+{
+ clear_pending_query_bits(&cmd_buffer->state.queries.clear_bits, flushed_bits);
+ clear_pending_query_bits(&cmd_buffer->state.queries.buffer_write_bits, flushed_bits);
+}
+
static bool
mem_update(void *dst, const void *src, size_t size)
{
@@ -465,6 +442,184 @@ set_dirty_for_bind_map(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.push_constants_dirty |= mesa_to_vk_shader_stage(stage);
}
+static void
+anv_cmd_buffer_set_ray_query_buffer(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipeline_state,
+ struct anv_pipeline *pipeline,
+ VkShaderStageFlags stages)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ uint64_t ray_shadow_size =
+ align64(brw_rt_ray_queries_shadow_stacks_size(device->info,
+ pipeline->ray_queries),
+ 4096);
+ if (ray_shadow_size > 0 &&
+ (!cmd_buffer->state.ray_query_shadow_bo ||
+ cmd_buffer->state.ray_query_shadow_bo->size < ray_shadow_size)) {
+ unsigned shadow_size_log2 = MAX2(util_logbase2_ceil(ray_shadow_size), 16);
+ unsigned bucket = shadow_size_log2 - 16;
+ assert(bucket < ARRAY_SIZE(device->ray_query_shadow_bos));
+
+ struct anv_bo *bo = p_atomic_read(&device->ray_query_shadow_bos[bucket]);
+ if (bo == NULL) {
+ struct anv_bo *new_bo;
+ VkResult result = anv_device_alloc_bo(device, "RT queries shadow",
+ ray_shadow_size,
+ ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
+ 0, /* explicit_address */
+ &new_bo);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+
+ bo = p_atomic_cmpxchg(&device->ray_query_shadow_bos[bucket], NULL, new_bo);
+ if (bo != NULL) {
+ anv_device_release_bo(device, bo);
+ } else {
+ bo = new_bo;
+ }
+ }
+ cmd_buffer->state.ray_query_shadow_bo = bo;
+
+ /* Add the ray query buffers to the batch list. */
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ cmd_buffer->state.ray_query_shadow_bo);
+ }
+
+ /* Add the HW buffer to the list of BOs used. */
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ device->ray_query_bo);
+
+ /* Fill the push constants & mark them dirty. */
+ struct anv_address ray_query_globals_addr =
+ anv_genX(device->info, cmd_buffer_ray_query_globals)(cmd_buffer);
+ pipeline_state->push_constants.ray_query_globals =
+ anv_address_physical(ray_query_globals_addr);
+ cmd_buffer->state.push_constants_dirty |= stages;
+ pipeline_state->push_constants_data_dirty = true;
+}
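
The bucketing above rounds the shadow-stack size up to a power of two, with 64KiB (2^16) as the smallest allocation, so bucket 0 holds BOs up to 64KiB, bucket 1 up to 128KiB, and so on. A hedged, standalone sketch of that sizing logic, with local helpers standing in for Mesa's MAX2() and util_logbase2_ceil():

   #include <stdint.h>

   /* Smallest n such that (1ull << n) >= size (size > 0 assumed). */
   static unsigned
   log2_ceil(uint64_t size)
   {
      unsigned n = 0;
      while (((uint64_t)1 << n) < size)
         n++;
      return n;
   }

   /* E.g. a 1MiB shadow size gives log2 == 20, i.e. bucket 4. */
   static unsigned
   shadow_bucket(uint64_t ray_shadow_size)
   {
      unsigned shadow_size_log2 = log2_ceil(ray_shadow_size);
      if (shadow_size_log2 < 16)
         shadow_size_log2 = 16;
      return shadow_size_log2 - 16;
   }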
+
+/**
+ * This function computes the changes between two pipelines and flags the dirty HW
+ * state appropriately.
+ */
+static void
+anv_cmd_buffer_flush_pipeline_state(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_graphics_pipeline *old_pipeline,
+ struct anv_graphics_pipeline *new_pipeline)
+{
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
+
+#define diff_fix_state(bit, name) \
+ do { \
+ /* Fixed states should always have matching sizes */ \
+ assert(old_pipeline == NULL || \
+ old_pipeline->name.len == new_pipeline->name.len); \
+ /* Don't bother memcmp if the state is already dirty */ \
+ if (!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_##bit) && \
+ (old_pipeline == NULL || \
+ memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
+ &new_pipeline->batch_data[new_pipeline->name.offset], \
+ 4 * new_pipeline->name.len) != 0)) \
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
+ } while (0)
+#define diff_var_state(bit, name) \
+ do { \
+ /* Don't bother memcmp if the state is already dirty */ \
+ /* Also if the new state is empty, avoid marking dirty */ \
+ if (!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_##bit) && \
+ new_pipeline->name.len != 0 && \
+ (old_pipeline == NULL || \
+ old_pipeline->name.len != new_pipeline->name.len || \
+ memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
+ &new_pipeline->batch_data[new_pipeline->name.offset], \
+ 4 * new_pipeline->name.len) != 0)) \
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
+ } while (0)
+#define assert_identical(bit, name) \
+ do { \
+ /* Fixed states should always have matching sizes */ \
+ assert(old_pipeline == NULL || \
+ old_pipeline->name.len == new_pipeline->name.len); \
+ assert(old_pipeline == NULL || \
+ memcmp(&old_pipeline->batch_data[old_pipeline->name.offset], \
+ &new_pipeline->batch_data[new_pipeline->name.offset], \
+ 4 * new_pipeline->name.len) == 0); \
+ } while (0)
+#define assert_empty(name) assert(new_pipeline->name.len == 0)
+
+ /* Compare all states, including partially packed ones; their dynamic part
+ * is left at 0, but the static part could still change.
+ */
+ diff_fix_state(URB, final.urb);
+ diff_fix_state(VF_SGVS, final.vf_sgvs);
+ if (cmd_buffer->device->info->ver >= 11)
+ diff_fix_state(VF_SGVS_2, final.vf_sgvs_2);
+ if (cmd_buffer->device->info->ver >= 12)
+ diff_fix_state(PRIMITIVE_REPLICATION, final.primitive_replication);
+ diff_fix_state(SBE, final.sbe);
+ diff_fix_state(SBE_SWIZ, final.sbe_swiz);
+ diff_fix_state(VS, final.vs);
+ diff_fix_state(HS, final.hs);
+ diff_fix_state(DS, final.ds);
+
+ diff_fix_state(CLIP, partial.clip);
+ diff_fix_state(SF, partial.sf);
+ diff_fix_state(RASTER, partial.raster);
+ diff_fix_state(MULTISAMPLE, partial.ms);
+ diff_fix_state(WM, partial.wm);
+ diff_fix_state(STREAMOUT, partial.so);
+ diff_fix_state(GS, partial.gs);
+ diff_fix_state(TE, partial.te);
+ diff_fix_state(VFG, partial.vfg);
+ diff_fix_state(PS, partial.ps);
+ diff_fix_state(PS_EXTRA, partial.ps_extra);
+
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
+ diff_fix_state(TASK_CONTROL, final.task_control);
+ diff_fix_state(TASK_SHADER, final.task_shader);
+ diff_fix_state(TASK_REDISTRIB, final.task_redistrib);
+ diff_fix_state(MESH_CONTROL, final.mesh_control);
+ diff_fix_state(MESH_SHADER, final.mesh_shader);
+ diff_fix_state(MESH_DISTRIB, final.mesh_distrib);
+ diff_fix_state(CLIP_MESH, final.clip_mesh);
+ diff_fix_state(SBE_MESH, final.sbe_mesh);
+ } else {
+ assert_empty(final.task_control);
+ assert_empty(final.task_shader);
+ assert_empty(final.task_redistrib);
+ assert_empty(final.mesh_control);
+ assert_empty(final.mesh_shader);
+ assert_empty(final.mesh_distrib);
+ assert_empty(final.clip_mesh);
+ assert_empty(final.sbe_mesh);
+ }
+
+ /* States that should never vary between pipelines, but can be affected by
+ * blorp etc...
+ */
+ assert_identical(VF_STATISTICS, final.vf_statistics);
+
+ /* States that can vary in length */
+ diff_var_state(VF_SGVS_INSTANCING, final.vf_sgvs_instancing);
+ diff_var_state(SO_DECL_LIST, final.so_decl_list);
+
+#undef diff_fix_state
+#undef diff_var_state
+#undef assert_identical
+#undef assert_empty
+
+ /* We're not diffing the following:
+ * - anv_graphics_pipeline::vertex_input_data
+ * - anv_graphics_pipeline::final::vf_instancing
+ *
+ * since they are tracked by the runtime.
+ */
+}
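
The diff_fix_state()/diff_var_state() macros above boil down to one pattern: compare the pre-packed dword ranges of the old and new pipeline and only flag the corresponding HW state dirty when the bytes actually differ. A minimal sketch of that pattern, with hypothetical types standing in for the anv ones:

   #include <stdbool.h>
   #include <stdint.h>
   #include <string.h>

   struct packed_range {
      uint32_t offset; /* in dwords, into the pipeline's batch data */
      uint32_t len;    /* in dwords */
   };

   /* Returns true when the packed dwords backing one piece of HW state
    * differ between two pipelines, i.e. when it needs to be re-emitted.
    */
   static bool
   state_needs_reemit(const uint32_t *old_data, struct packed_range old_range,
                      const uint32_t *new_data, struct packed_range new_range)
   {
      if (old_data == NULL)                /* no pipeline previously bound */
         return true;
      if (old_range.len != new_range.len)  /* variable-length states */
         return true;
      return memcmp(&old_data[old_range.offset],
                    &new_data[new_range.offset],
                    4 * new_range.len) != 0;
   }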
+
void anv_CmdBindPipeline(
VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
@@ -472,463 +627,194 @@ void anv_CmdBindPipeline(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+ struct anv_cmd_pipeline_state *state;
+ VkShaderStageFlags stages = 0;
switch (pipelineBindPoint) {
case VK_PIPELINE_BIND_POINT_COMPUTE: {
- struct anv_compute_pipeline *compute_pipeline =
- anv_pipeline_to_compute(pipeline);
- if (cmd_buffer->state.compute.pipeline == compute_pipeline)
+ if (cmd_buffer->state.compute.base.pipeline == pipeline)
return;
- cmd_buffer->state.compute.pipeline = compute_pipeline;
+ cmd_buffer->state.compute.base.pipeline = pipeline;
cmd_buffer->state.compute.pipeline_dirty = true;
+
+ struct anv_compute_pipeline *compute_pipeline =
+ anv_pipeline_to_compute(pipeline);
set_dirty_for_bind_map(cmd_buffer, MESA_SHADER_COMPUTE,
&compute_pipeline->cs->bind_map);
+
+ state = &cmd_buffer->state.compute.base;
+ stages = VK_SHADER_STAGE_COMPUTE_BIT;
break;
}
case VK_PIPELINE_BIND_POINT_GRAPHICS: {
- struct anv_graphics_pipeline *gfx_pipeline =
+ struct anv_graphics_pipeline *new_pipeline =
anv_pipeline_to_graphics(pipeline);
- if (cmd_buffer->state.gfx.pipeline == gfx_pipeline)
+
+ /* Apply the non-dynamic state from the pipeline */
+ vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
+ &new_pipeline->dynamic_state);
+
+ if (cmd_buffer->state.gfx.base.pipeline == pipeline)
return;
- cmd_buffer->state.gfx.pipeline = gfx_pipeline;
- cmd_buffer->state.gfx.vb_dirty |= gfx_pipeline->vb_used;
+ struct anv_graphics_pipeline *old_pipeline =
+ cmd_buffer->state.gfx.base.pipeline == NULL ? NULL :
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ cmd_buffer->state.gfx.base.pipeline = pipeline;
cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
- anv_foreach_stage(stage, gfx_pipeline->active_stages) {
+ anv_foreach_stage(stage, new_pipeline->base.base.active_stages) {
set_dirty_for_bind_map(cmd_buffer, stage,
- &gfx_pipeline->shaders[stage]->bind_map);
+ &new_pipeline->base.shaders[stage]->bind_map);
+ }
+
+ state = &cmd_buffer->state.gfx.base;
+ stages = new_pipeline->base.base.active_stages;
+
+ /* When the pipeline is using independent states and dynamic buffers,
+ * this will trigger an update of anv_push_constants::dynamic_base_index
+ * & anv_push_constants::dynamic_offsets.
+ */
+ struct anv_push_constants *push =
+ &cmd_buffer->state.gfx.base.push_constants;
+ struct anv_pipeline_sets_layout *layout = &new_pipeline->base.base.layout;
+ if (layout->independent_sets && layout->num_dynamic_buffers > 0) {
+ bool modified = false;
+ for (uint32_t s = 0; s < layout->num_sets; s++) {
+ if (layout->set[s].layout == NULL)
+ continue;
+
+ assert(layout->set[s].dynamic_offset_start < MAX_DYNAMIC_BUFFERS);
+ if (layout->set[s].layout->dynamic_offset_count > 0 &&
+ (push->desc_surface_offsets[s] & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK) !=
+ layout->set[s].dynamic_offset_start) {
+ push->desc_surface_offsets[s] &= ~ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
+ push->desc_surface_offsets[s] |= (layout->set[s].dynamic_offset_start &
+ ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
+ modified = true;
+ }
+ }
+ if (modified) {
+ cmd_buffer->state.push_constants_dirty |= stages;
+ state->push_constants_data_dirty = true;
+ }
}
- /* Apply the dynamic state from the pipeline */
- cmd_buffer->state.gfx.dirty |=
- anv_dynamic_state_copy(&cmd_buffer->state.gfx.dynamic,
- &gfx_pipeline->dynamic_state,
- gfx_pipeline->dynamic_state_mask);
+ anv_cmd_buffer_flush_pipeline_state(cmd_buffer, old_pipeline, new_pipeline);
break;
}
case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
- struct anv_ray_tracing_pipeline *rt_pipeline =
- anv_pipeline_to_ray_tracing(pipeline);
- if (cmd_buffer->state.rt.pipeline == rt_pipeline)
+ if (cmd_buffer->state.rt.base.pipeline == pipeline)
return;
- cmd_buffer->state.rt.pipeline = rt_pipeline;
+ cmd_buffer->state.rt.base.pipeline = pipeline;
cmd_buffer->state.rt.pipeline_dirty = true;
+ struct anv_ray_tracing_pipeline *rt_pipeline =
+ anv_pipeline_to_ray_tracing(pipeline);
if (rt_pipeline->stack_size > 0) {
anv_CmdSetRayTracingPipelineStackSizeKHR(commandBuffer,
rt_pipeline->stack_size);
}
+
+ state = &cmd_buffer->state.rt.base;
break;
}
default:
- assert(!"invalid bind point");
+ unreachable("invalid bind point");
break;
}
-}
-
-void anv_CmdSetRasterizerDiscardEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 rasterizerDiscardEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.raster_discard = rasterizerDiscardEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
-}
-
-void anv_CmdSetDepthBiasEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 depthBiasEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_bias_enable = depthBiasEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
-}
-
-void anv_CmdSetPrimitiveRestartEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 primitiveRestartEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.primitive_restart_enable = primitiveRestartEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
-}
-
-void anv_CmdSetLogicOpEXT(
- VkCommandBuffer commandBuffer,
- VkLogicOp logicOp)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.logic_op = logicOp;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
-}
-
-void anv_CmdSetPatchControlPointsEXT(
- VkCommandBuffer commandBuffer,
- uint32_t patchControlPoints)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
-void anv_CmdSetViewport(
- VkCommandBuffer commandBuffer,
- uint32_t firstViewport,
- uint32_t viewportCount,
- const VkViewport* pViewports)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- const uint32_t total_count = firstViewport + viewportCount;
- if (cmd_buffer->state.gfx.dynamic.viewport.count < total_count)
- cmd_buffer->state.gfx.dynamic.viewport.count = total_count;
-
- memcpy(cmd_buffer->state.gfx.dynamic.viewport.viewports + firstViewport,
- pViewports, viewportCount * sizeof(*pViewports));
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
-}
-
-void anv_CmdSetViewportWithCountEXT(
- VkCommandBuffer commandBuffer,
- uint32_t viewportCount,
- const VkViewport* pViewports)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.viewport.count = viewportCount;
-
- memcpy(cmd_buffer->state.gfx.dynamic.viewport.viewports,
- pViewports, viewportCount * sizeof(*pViewports));
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
-}
-
-void anv_CmdSetScissor(
- VkCommandBuffer commandBuffer,
- uint32_t firstScissor,
- uint32_t scissorCount,
- const VkRect2D* pScissors)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- const uint32_t total_count = firstScissor + scissorCount;
- if (cmd_buffer->state.gfx.dynamic.scissor.count < total_count)
- cmd_buffer->state.gfx.dynamic.scissor.count = total_count;
-
- memcpy(cmd_buffer->state.gfx.dynamic.scissor.scissors + firstScissor,
- pScissors, scissorCount * sizeof(*pScissors));
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
-}
-void anv_CmdSetScissorWithCountEXT(
- VkCommandBuffer commandBuffer,
- uint32_t scissorCount,
- const VkRect2D* pScissors)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.scissor.count = scissorCount;
-
- memcpy(cmd_buffer->state.gfx.dynamic.scissor.scissors,
- pScissors, scissorCount * sizeof(*pScissors));
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
+ if (pipeline->ray_queries > 0)
+ anv_cmd_buffer_set_ray_query_buffer(cmd_buffer, state, pipeline, stages);
}
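
The dynamic-offset fixup in anv_CmdBindPipeline above packs a per-set dynamic-offset index into the low bits of anv_push_constants::desc_surface_offsets, leaving the high bits for the heap-relative offset. A minimal sketch of that packing, assuming an illustrative 6-bit index mask rather than the driver's actual ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK value:

   #include <assert.h>
   #include <stdint.h>

   #define DYN_INDEX_MASK 0x3fu /* hypothetical: low 6 bits carry the index */

   /* Replace the dynamic-offset index without disturbing the offset bits. */
   static uint32_t
   repack_dynamic_index(uint32_t field, uint32_t dynamic_offset_start)
   {
      assert((dynamic_offset_start & ~DYN_INDEX_MASK) == 0);
      field &= ~DYN_INDEX_MASK;          /* clear the previous index */
      field |= dynamic_offset_start;     /* insert the new one */
      return field;
   }
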
-void anv_CmdSetPrimitiveTopologyEXT(
- VkCommandBuffer commandBuffer,
- VkPrimitiveTopology primitiveTopology)
+static struct anv_cmd_pipeline_state *
+anv_cmd_buffer_get_pipeline_layout_state(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ const struct anv_descriptor_set_layout *set_layout,
+ VkShaderStageFlags *out_stages)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.primitive_topology = primitiveTopology;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
-}
-
-void anv_CmdSetLineWidth(
- VkCommandBuffer commandBuffer,
- float lineWidth)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.line_width = lineWidth;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
-}
-
-void anv_CmdSetDepthBias(
- VkCommandBuffer commandBuffer,
- float depthBiasConstantFactor,
- float depthBiasClamp,
- float depthBiasSlopeFactor)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_bias.bias = depthBiasConstantFactor;
- cmd_buffer->state.gfx.dynamic.depth_bias.clamp = depthBiasClamp;
- cmd_buffer->state.gfx.dynamic.depth_bias.slope = depthBiasSlopeFactor;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
-}
-
-void anv_CmdSetBlendConstants(
- VkCommandBuffer commandBuffer,
- const float blendConstants[4])
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- memcpy(cmd_buffer->state.gfx.dynamic.blend_constants,
- blendConstants, sizeof(float) * 4);
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
-}
-
-void anv_CmdSetDepthBounds(
- VkCommandBuffer commandBuffer,
- float minDepthBounds,
- float maxDepthBounds)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_bounds.min = minDepthBounds;
- cmd_buffer->state.gfx.dynamic.depth_bounds.max = maxDepthBounds;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
-}
-
-void anv_CmdSetStencilCompareMask(
- VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t compareMask)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_compare_mask.front = compareMask;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_compare_mask.back = compareMask;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
-}
-
-void anv_CmdSetStencilWriteMask(
- VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t writeMask)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_write_mask.front = writeMask;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_write_mask.back = writeMask;
+ *out_stages = set_layout->shader_stages;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
-}
-
-void anv_CmdSetStencilReference(
- VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- uint32_t reference)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_reference.front = reference;
- if (faceMask & VK_STENCIL_FACE_BACK_BIT)
- cmd_buffer->state.gfx.dynamic.stencil_reference.back = reference;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
-}
-
-void anv_CmdSetSampleLocationsEXT(
- VkCommandBuffer commandBuffer,
- const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- struct anv_dynamic_state *dyn_state = &cmd_buffer->state.gfx.dynamic;
- uint32_t samples = pSampleLocationsInfo->sampleLocationsPerPixel;
-
- dyn_state->sample_locations.samples = samples;
- typed_memcpy(dyn_state->sample_locations.locations,
- pSampleLocationsInfo->pSampleLocations, samples);
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
-}
-
-void anv_CmdSetLineStippleEXT(
- VkCommandBuffer commandBuffer,
- uint32_t lineStippleFactor,
- uint16_t lineStipplePattern)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.line_stipple.factor = lineStippleFactor;
- cmd_buffer->state.gfx.dynamic.line_stipple.pattern = lineStipplePattern;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
-}
-
-void anv_CmdSetCullModeEXT(
- VkCommandBuffer commandBuffer,
- VkCullModeFlags cullMode)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.cull_mode = cullMode;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_CULL_MODE;
-}
-
-void anv_CmdSetFrontFaceEXT(
- VkCommandBuffer commandBuffer,
- VkFrontFace frontFace)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.front_face = frontFace;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
-}
-
-void anv_CmdSetDepthTestEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 depthTestEnable)
-
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_test_enable = depthTestEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
-}
-
-void anv_CmdSetDepthWriteEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 depthWriteEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_write_enable = depthWriteEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
-}
-
-void anv_CmdSetDepthCompareOpEXT(
- VkCommandBuffer commandBuffer,
- VkCompareOp depthCompareOp)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_compare_op = depthCompareOp;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
-}
-
-void anv_CmdSetDepthBoundsTestEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 depthBoundsTestEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- cmd_buffer->state.gfx.dynamic.depth_bounds_test_enable = depthBoundsTestEnable;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
-}
+ switch (bind_point) {
+ case VK_PIPELINE_BIND_POINT_GRAPHICS:
+ *out_stages &= VK_SHADER_STAGE_ALL_GRAPHICS |
+ (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader ?
+ (VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT) : 0);
+ return &cmd_buffer->state.gfx.base;
-void anv_CmdSetStencilTestEnableEXT(
- VkCommandBuffer commandBuffer,
- VkBool32 stencilTestEnable)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ case VK_PIPELINE_BIND_POINT_COMPUTE:
+ *out_stages &= VK_SHADER_STAGE_COMPUTE_BIT;
+ return &cmd_buffer->state.compute.base;
- cmd_buffer->state.gfx.dynamic.stencil_test_enable = stencilTestEnable;
+ case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
+ *out_stages &= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR;
+ return &cmd_buffer->state.rt.base;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
+ default:
+ unreachable("invalid bind point");
+ }
}
-void anv_CmdSetStencilOpEXT(
- VkCommandBuffer commandBuffer,
- VkStencilFaceFlags faceMask,
- VkStencilOp failOp,
- VkStencilOp passOp,
- VkStencilOp depthFailOp,
- VkCompareOp compareOp)
+static void
+anv_cmd_buffer_maybe_dirty_descriptor_mode(struct anv_cmd_buffer *cmd_buffer,
+ enum anv_cmd_descriptor_buffer_mode new_mode)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
- cmd_buffer->state.gfx.dynamic.stencil_op.front.fail_op = failOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.front.pass_op = passOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.front.depth_fail_op = depthFailOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.front.compare_op = compareOp;
- }
-
- if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
- cmd_buffer->state.gfx.dynamic.stencil_op.back.fail_op = failOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.back.pass_op = passOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.back.depth_fail_op = depthFailOp;
- cmd_buffer->state.gfx.dynamic.stencil_op.back.compare_op = compareOp;
- }
+ if (cmd_buffer->state.current_db_mode == new_mode)
+ return;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
+ /* Ensure we program the STATE_BASE_ADDRESS properly at least once */
+ cmd_buffer->state.descriptor_buffers.dirty = true;
+ cmd_buffer->state.pending_db_mode = new_mode;
}
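
The helper above only records the requested descriptor-buffer mode; the actual STATE_BASE_ADDRESS reprogramming is deferred until the command buffer next flushes its state. A hedged sketch of that deferred-switch pattern, with illustrative names in place of the driver's anv_cmd_state fields:

   #include <stdbool.h>

   enum db_mode { DB_MODE_LEGACY, DB_MODE_BUFFER };

   struct db_switch {
      enum db_mode current;  /* mode last programmed on the GPU */
      enum db_mode pending;  /* mode requested by the latest bind */
      bool dirty;            /* reprogram needed at flush time */
   };

   static void
   request_mode(struct db_switch *s, enum db_mode new_mode)
   {
      if (s->current == new_mode)
         return;
      s->pending = new_mode;
      s->dirty = true;       /* consumed when base addresses are re-emitted */
   }

   static void
   flush_mode(struct db_switch *s)
   {
      if (!s->dirty)
         return;
      /* emit STATE_BASE_ADDRESS for s->pending here */
      s->current = s->pending;
      s->dirty = false;
   }
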
static void
anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point,
- struct anv_pipeline_layout *layout,
+ struct anv_pipeline_sets_layout *layout,
uint32_t set_index,
struct anv_descriptor_set *set,
uint32_t *dynamic_offset_count,
const uint32_t **dynamic_offsets)
{
+ /* Either we have no pool because it's a push descriptor or the pool is not
+ * host-only:
+ *
+ * VUID-vkCmdBindDescriptorSets-pDescriptorSets-04616:
+ *
+ * "Each element of pDescriptorSets must not have been allocated from a
+ * VkDescriptorPool with the
+ * VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT flag set"
+ */
+ assert(!set->pool || !set->pool->host_only);
+
struct anv_descriptor_set_layout *set_layout =
layout->set[set_index].layout;
- VkShaderStageFlags stages = set_layout->shader_stages;
- struct anv_cmd_pipeline_state *pipe_state;
+ anv_cmd_buffer_maybe_dirty_descriptor_mode(
+ cmd_buffer,
+ (set->layout->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) != 0 ?
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER :
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY);
- switch (bind_point) {
- case VK_PIPELINE_BIND_POINT_GRAPHICS:
- stages &= VK_SHADER_STAGE_ALL_GRAPHICS;
- pipe_state = &cmd_buffer->state.gfx.base;
- break;
-
- case VK_PIPELINE_BIND_POINT_COMPUTE:
- stages &= VK_SHADER_STAGE_COMPUTE_BIT;
- pipe_state = &cmd_buffer->state.compute.base;
- break;
-
- case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
- stages &= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
- VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
- VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
- VK_SHADER_STAGE_MISS_BIT_KHR |
- VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
- VK_SHADER_STAGE_CALLABLE_BIT_KHR;
- pipe_state = &cmd_buffer->state.rt.base;
- break;
-
- default:
- unreachable("invalid bind point");
- }
+ VkShaderStageFlags stages;
+ struct anv_cmd_pipeline_state *pipe_state =
+ anv_cmd_buffer_get_pipeline_layout_state(cmd_buffer, bind_point,
+ set_layout, &stages);
VkShaderStageFlags dirty_stages = 0;
/* If it's a push descriptor set, we have to flag things as dirty
@@ -936,23 +822,58 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
* may have edited in-place.
*/
if (pipe_state->descriptors[set_index] != set ||
- anv_descriptor_set_is_push(set)) {
+ anv_descriptor_set_is_push(set)) {
pipe_state->descriptors[set_index] = set;
- /* Ray-tracing shaders are entirely bindless and so they don't have
- * access to HW binding tables. This means that we have to upload the
- * descriptor set as an 64-bit address in the push constants.
- */
- if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) {
- struct anv_push_constants *push = &pipe_state->push_constants;
+ if (set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) {
+ assert(set->is_push);
- struct anv_address addr = anv_descriptor_set_address(set);
- push->desc_sets[set_index] = anv_address_physical(addr);
+ pipe_state->descriptor_buffers[set_index].buffer_index = -1;
+ pipe_state->descriptor_buffers[set_index].buffer_offset = set->desc_offset;
+ pipe_state->descriptor_buffers[set_index].bound = true;
+ cmd_buffer->state.descriptors_dirty |= stages;
+ cmd_buffer->state.descriptor_buffers.offsets_dirty |= stages;
+ } else {
+ /* When using indirect descriptors, stages that have access to the HW
+ * binding tables never need to read the
+ * anv_push_constants::desc_offsets fields, because any data they
+ * need from the descriptor buffer is reachable through a binding
+ * table entry. For the "bindless" stages (Mesh/Task/RT), we do need
+ * to provide anv_push_constants::desc_offsets matching the bound
+ * descriptor so that shaders can access the descriptor buffer
+ * through A64 messages.
+ *
+ * With direct descriptors, shaders use
+ * anv_push_constants::desc_offsets to build bindless offsets, so we
+ * always need to update the push constant data.
+ */
+ bool update_desc_sets =
+ !cmd_buffer->device->physical->indirect_descriptors ||
+ (stages & (VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT |
+ VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR));
+
+ if (update_desc_sets) {
+ struct anv_push_constants *push = &pipe_state->push_constants;
+ uint64_t offset =
+ anv_address_physical(set->desc_surface_addr) -
+ cmd_buffer->device->physical->va.internal_surface_state_pool.addr;
+ assert((offset & ~ANV_DESCRIPTOR_SET_OFFSET_MASK) == 0);
+ push->desc_surface_offsets[set_index] &= ~ANV_DESCRIPTOR_SET_OFFSET_MASK;
+ push->desc_surface_offsets[set_index] |= offset;
+ push->desc_sampler_offsets[set_index] |=
+ anv_address_physical(set->desc_sampler_addr) -
+ cmd_buffer->device->physical->va.dynamic_state_pool.addr;
- if (addr.bo) {
anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
- cmd_buffer->batch.alloc,
- addr.bo);
+ set->desc_surface_addr.bo);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ set->desc_sampler_addr.bo);
}
}
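
For the bindless stages described above, the push constants carry heap-relative 32-bit offsets rather than full 64-bit pointers. A small sketch of that reduction, assuming a hypothetical offset mask that is the complement of the dynamic-index bits shown earlier:

   #include <assert.h>
   #include <stdint.h>

   #define SET_OFFSET_MASK 0xffffffc0u /* hypothetical: high bits hold the offset */

   /* Fold a heap-relative descriptor offset into the push-constant field,
    * preserving whatever dynamic-offset index sits in the low bits. */
   static uint32_t
   pack_heap_offset(uint32_t field, uint64_t desc_addr, uint64_t heap_base)
   {
      uint64_t offset = desc_addr - heap_base;
      assert((offset & ~(uint64_t)SET_OFFSET_MASK) == 0);
      field &= ~SET_OFFSET_MASK;
      field |= (uint32_t)offset;
      return field;
   }
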
@@ -967,6 +888,11 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
uint32_t *push_offsets =
&push->dynamic_offsets[dynamic_offset_start];
+ memcpy(pipe_state->dynamic_offsets[set_index].offsets,
+ *dynamic_offsets,
+ sizeof(uint32_t) * MIN2(*dynamic_offset_count,
+ set_layout->dynamic_offset_count));
+
/* Assert that everything is in range */
assert(set_layout->dynamic_offset_count <= *dynamic_offset_count);
assert(dynamic_offset_start + set_layout->dynamic_offset_count <=
@@ -974,7 +900,8 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
for (uint32_t i = 0; i < set_layout->dynamic_offset_count; i++) {
if (push_offsets[i] != (*dynamic_offsets)[i]) {
- push_offsets[i] = (*dynamic_offsets)[i];
+ pipe_state->dynamic_offsets[set_index].offsets[i] =
+ push_offsets[i] = (*dynamic_offsets)[i];
/* dynamic_offset_stages[] elements could contain blanket
* values like VK_SHADER_STAGE_ALL, so limit this to the
* binding point's bits.
@@ -988,35 +915,187 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
}
}
- cmd_buffer->state.descriptors_dirty |= dirty_stages;
+ if (set->is_push)
+ cmd_buffer->state.push_descriptors_dirty |= dirty_stages;
+ else
+ cmd_buffer->state.descriptors_dirty |= dirty_stages;
cmd_buffer->state.push_constants_dirty |= dirty_stages;
+ pipe_state->push_constants_data_dirty = true;
}
-void anv_CmdBindDescriptorSets(
+#define ANV_GRAPHICS_STAGE_BITS \
+ (VK_SHADER_STAGE_ALL_GRAPHICS | \
+ VK_SHADER_STAGE_MESH_BIT_EXT | \
+ VK_SHADER_STAGE_TASK_BIT_EXT)
+
+#define ANV_RT_STAGE_BITS \
+ (VK_SHADER_STAGE_RAYGEN_BIT_KHR | \
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR | \
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | \
+ VK_SHADER_STAGE_MISS_BIT_KHR | \
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR | \
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR)
+
+void anv_CmdBindDescriptorSets2KHR(
VkCommandBuffer commandBuffer,
- VkPipelineBindPoint pipelineBindPoint,
- VkPipelineLayout _layout,
- uint32_t firstSet,
- uint32_t descriptorSetCount,
- const VkDescriptorSet* pDescriptorSets,
- uint32_t dynamicOffsetCount,
- const uint32_t* pDynamicOffsets)
+ const VkBindDescriptorSetsInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout);
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pInfo->layout);
+ struct anv_pipeline_sets_layout *layout = &pipeline_layout->sets_layout;
+
+ assert(pInfo->firstSet + pInfo->descriptorSetCount <= MAX_SETS);
+
+ if (pInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
+ uint32_t dynamicOffsetCount = pInfo->dynamicOffsetCount;
+ const uint32_t *pDynamicOffsets = pInfo->pDynamicOffsets;
+
+ for (uint32_t i = 0; i < pInfo->descriptorSetCount; i++) {
+ ANV_FROM_HANDLE(anv_descriptor_set, set, pInfo->pDescriptorSets[i]);
+ if (set == NULL)
+ continue;
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ layout, pInfo->firstSet + i, set,
+ &dynamicOffsetCount,
+ &pDynamicOffsets);
+ }
+ }
+ if (pInfo->stageFlags & ANV_GRAPHICS_STAGE_BITS) {
+ uint32_t dynamicOffsetCount = pInfo->dynamicOffsetCount;
+ const uint32_t *pDynamicOffsets = pInfo->pDynamicOffsets;
+
+ for (uint32_t i = 0; i < pInfo->descriptorSetCount; i++) {
+ ANV_FROM_HANDLE(anv_descriptor_set, set, pInfo->pDescriptorSets[i]);
+ if (set == NULL)
+ continue;
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_GRAPHICS,
+ layout, pInfo->firstSet + i, set,
+ &dynamicOffsetCount,
+ &pDynamicOffsets);
+ }
+ }
+ if (pInfo->stageFlags & ANV_RT_STAGE_BITS) {
+ uint32_t dynamicOffsetCount = pInfo->dynamicOffsetCount;
+ const uint32_t *pDynamicOffsets = pInfo->pDynamicOffsets;
+
+ for (uint32_t i = 0; i < pInfo->descriptorSetCount; i++) {
+ ANV_FROM_HANDLE(anv_descriptor_set, set, pInfo->pDescriptorSets[i]);
+ if (set == NULL)
+ continue;
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR,
+ layout, pInfo->firstSet + i, set,
+ &dynamicOffsetCount,
+ &pDynamicOffsets);
+ }
+ }
+}
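
Since the entry point above filters by stageFlags, one call can bind the same sets at several bind points, and the dynamic-offset cursor restarts for each of them. A usage sketch from the application side, assuming VK_KHR_maintenance6 is enabled and the extension entry point has been resolved; the handles are placeholders:

   #include <vulkan/vulkan.h>

   /* Bind one set for both compute and all graphics stages in a single call. */
   static void
   bind_set_everywhere(VkCommandBuffer cmd, VkPipelineLayout layout,
                       VkDescriptorSet set)
   {
      const VkBindDescriptorSetsInfoKHR info = {
         .sType = VK_STRUCTURE_TYPE_BIND_DESCRIPTOR_SETS_INFO_KHR,
         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT | VK_SHADER_STAGE_ALL_GRAPHICS,
         .layout = layout,
         .firstSet = 0,
         .descriptorSetCount = 1,
         .pDescriptorSets = &set,
         .dynamicOffsetCount = 0,
      };
      vkCmdBindDescriptorSets2KHR(cmd, &info);
   }
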
- assert(firstSet + descriptorSetCount <= MAX_SETS);
+void anv_CmdBindDescriptorBuffersEXT(
+ VkCommandBuffer commandBuffer,
+ uint32_t bufferCount,
+ const VkDescriptorBufferBindingInfoEXT* pBindingInfos)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_state *state = &cmd_buffer->state;
- for (uint32_t i = 0; i < descriptorSetCount; i++) {
- ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]);
- anv_cmd_buffer_bind_descriptor_set(cmd_buffer, pipelineBindPoint,
- layout, firstSet + i, set,
- &dynamicOffsetCount,
- &pDynamicOffsets);
+ for (uint32_t i = 0; i < bufferCount; i++) {
+ assert(pBindingInfos[i].address >= cmd_buffer->device->physical->va.descriptor_buffer_pool.addr &&
+ pBindingInfos[i].address < (cmd_buffer->device->physical->va.descriptor_buffer_pool.addr +
+ cmd_buffer->device->physical->va.descriptor_buffer_pool.size));
+
+ if (state->descriptor_buffers.address[i] != pBindingInfos[i].address) {
+ state->descriptor_buffers.address[i] = pBindingInfos[i].address;
+ if (pBindingInfos[i].usage & VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT)
+ state->descriptor_buffers.surfaces_address = pBindingInfos[i].address;
+ if (pBindingInfos[i].usage & VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT)
+ state->descriptor_buffers.samplers_address = pBindingInfos[i].address;
+ state->descriptor_buffers.dirty = true;
+ state->descriptor_buffers.offsets_dirty = ~0;
+ }
+ }
+
+ anv_cmd_buffer_maybe_dirty_descriptor_mode(cmd_buffer,
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER);
+}
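
The asserts in the function above only sanity-check that each bound descriptor buffer falls inside the VA window the driver reserves for descriptor buffers. A standalone sketch of that containment test, with a made-up base address and window size:

   #include <stdbool.h>
   #include <stdint.h>

   /* Hypothetical reserved VA window for descriptor buffers. */
   #define DESC_POOL_BASE 0x0000200000000000ull
   #define DESC_POOL_SIZE 0x0000000100000000ull /* 4 GiB */

   static bool
   address_in_descriptor_pool(uint64_t addr)
   {
      return addr >= DESC_POOL_BASE &&
             addr < DESC_POOL_BASE + DESC_POOL_SIZE;
   }
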
+
+static void
+anv_cmd_buffer_set_descriptor_buffer_offsets(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ struct anv_pipeline_layout *layout,
+ uint32_t first_set,
+ uint32_t set_count,
+ const VkDeviceSize *buffer_offsets,
+ const uint32_t *buffer_indices)
+{
+ for (uint32_t i = 0; i < set_count; i++) {
+ const uint32_t set_index = first_set + i;
+
+ const struct anv_descriptor_set_layout *set_layout =
+ layout->sets_layout.set[set_index].layout;
+ VkShaderStageFlags stages;
+ struct anv_cmd_pipeline_state *pipe_state =
+ anv_cmd_buffer_get_pipeline_layout_state(cmd_buffer, bind_point,
+ set_layout, &stages);
+
+ if (buffer_offsets[i] != pipe_state->descriptor_buffers[set_index].buffer_offset ||
+ buffer_indices[i] != pipe_state->descriptor_buffers[set_index].buffer_index ||
+ !pipe_state->descriptor_buffers[set_index].bound) {
+ pipe_state->descriptor_buffers[set_index].buffer_index = buffer_indices[i];
+ pipe_state->descriptor_buffers[set_index].buffer_offset = buffer_offsets[i];
+ cmd_buffer->state.descriptors_dirty |= stages;
+ cmd_buffer->state.descriptor_buffers.offsets_dirty |= stages;
+ }
+ pipe_state->descriptor_buffers[set_index].bound = true;
+ }
+}
+
+void anv_CmdSetDescriptorBufferOffsets2EXT(
+ VkCommandBuffer commandBuffer,
+ const VkSetDescriptorBufferOffsetsInfoEXT* pSetDescriptorBufferOffsetsInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_pipeline_layout, layout, pSetDescriptorBufferOffsetsInfo->layout);
+
+ if (pSetDescriptorBufferOffsetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
+ anv_cmd_buffer_set_descriptor_buffer_offsets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ layout,
+ pSetDescriptorBufferOffsetsInfo->firstSet,
+ pSetDescriptorBufferOffsetsInfo->setCount,
+ pSetDescriptorBufferOffsetsInfo->pOffsets,
+ pSetDescriptorBufferOffsetsInfo->pBufferIndices);
+ }
+ if (pSetDescriptorBufferOffsetsInfo->stageFlags & ANV_GRAPHICS_STAGE_BITS) {
+ anv_cmd_buffer_set_descriptor_buffer_offsets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_GRAPHICS,
+ layout,
+ pSetDescriptorBufferOffsetsInfo->firstSet,
+ pSetDescriptorBufferOffsetsInfo->setCount,
+ pSetDescriptorBufferOffsetsInfo->pOffsets,
+ pSetDescriptorBufferOffsetsInfo->pBufferIndices);
+ }
+ if (pSetDescriptorBufferOffsetsInfo->stageFlags & ANV_RT_STAGE_BITS) {
+ anv_cmd_buffer_set_descriptor_buffer_offsets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR,
+ layout,
+ pSetDescriptorBufferOffsetsInfo->firstSet,
+ pSetDescriptorBufferOffsetsInfo->setCount,
+ pSetDescriptorBufferOffsetsInfo->pOffsets,
+ pSetDescriptorBufferOffsetsInfo->pBufferIndices);
}
}
-void anv_CmdBindVertexBuffers2EXT(
+void anv_CmdBindDescriptorBufferEmbeddedSamplers2EXT(
+ VkCommandBuffer commandBuffer,
+ const VkBindDescriptorBufferEmbeddedSamplersInfoEXT* pBindDescriptorBufferEmbeddedSamplersInfo)
+{
+ /* no-op */
+}
+
+void anv_CmdBindVertexBuffers2(
VkCommandBuffer commandBuffer,
uint32_t firstBinding,
uint32_t bindingCount,
@@ -1031,31 +1110,29 @@ void anv_CmdBindVertexBuffers2EXT(
/* We have to defer setting up vertex buffer since we need the buffer
* stride from the pipeline. */
- if (pSizes)
- cmd_buffer->state.gfx.dynamic.dyn_vbo_size = true;
- if (pStrides)
- cmd_buffer->state.gfx.dynamic.dyn_vbo_stride = true;
-
assert(firstBinding + bindingCount <= MAX_VBS);
for (uint32_t i = 0; i < bindingCount; i++) {
- vb[firstBinding + i].buffer = anv_buffer_from_handle(pBuffers[i]);
- vb[firstBinding + i].offset = pOffsets[i];
- vb[firstBinding + i].size = pSizes ? pSizes[i] : 0;
- vb[firstBinding + i].stride = pStrides ? pStrides[i] : 0;
+ ANV_FROM_HANDLE(anv_buffer, buffer, pBuffers[i]);
+
+ if (buffer == NULL) {
+ vb[firstBinding + i] = (struct anv_vertex_binding) {
+ .buffer = NULL,
+ };
+ } else {
+ vb[firstBinding + i] = (struct anv_vertex_binding) {
+ .buffer = buffer,
+ .offset = pOffsets[i],
+ .size = vk_buffer_range(&buffer->vk, pOffsets[i],
+ pSizes ? pSizes[i] : VK_WHOLE_SIZE),
+ };
+ }
cmd_buffer->state.gfx.vb_dirty |= 1 << (firstBinding + i);
}
-}
-void anv_CmdBindVertexBuffers(
- VkCommandBuffer commandBuffer,
- uint32_t firstBinding,
- uint32_t bindingCount,
- const VkBuffer* pBuffers,
- const VkDeviceSize* pOffsets)
-{
- return anv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding,
- bindingCount, pBuffers, pOffsets,
- NULL, NULL);
+ if (pStrides != NULL) {
+ vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk, firstBinding,
+ bindingCount, pStrides);
+ }
}
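
The size stored above comes from vk_buffer_range(), which resolves VK_WHOLE_SIZE against the buffer's creation size. A conceptual re-statement of that helper, assuming it validates rather than clamps out-of-range inputs and taking a plain byte size instead of the runtime's vk_buffer object:

   #include <assert.h>
   #include <stdint.h>

   #define WHOLE_SIZE (~0ull) /* stand-in for VK_WHOLE_SIZE */

   /* Resolve an (offset, range) pair against the buffer's total size in bytes. */
   static uint64_t
   resolve_buffer_range(uint64_t buffer_size, uint64_t offset, uint64_t range)
   {
      assert(offset <= buffer_size);
      if (range == WHOLE_SIZE)
         return buffer_size - offset;       /* the rest of the buffer */
      assert(offset + range >= range);      /* no overflow */
      assert(offset + range <= buffer_size);
      return range;
   }
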
void anv_CmdBindTransformFeedbackBuffersEXT(
@@ -1081,8 +1158,8 @@ void anv_CmdBindTransformFeedbackBuffersEXT(
xfb[firstBinding + i].buffer = buffer;
xfb[firstBinding + i].offset = pOffsets[i];
xfb[firstBinding + i].size =
- anv_buffer_get_range(buffer, pOffsets[i],
- pSizes ? pSizes[i] : VK_WHOLE_SIZE);
+ vk_buffer_range(&buffer->vk, pOffsets[i],
+ pSizes ? pSizes[i] : VK_WHOLE_SIZE);
}
}
}
@@ -1146,9 +1223,12 @@ anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
&cmd_buffer->state.gfx.base.push_constants;
struct anv_state state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- sizeof(struct anv_push_constants),
- 32 /* bottom 5 bits MBZ */);
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ sizeof(struct anv_push_constants),
+ 32 /* bottom 5 bits MBZ */);
+ if (state.alloc_size == 0)
+ return state;
+
memcpy(state.map, data, sizeof(struct anv_push_constants));
return state;
@@ -1157,22 +1237,22 @@ anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
struct anv_state
anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
{
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
- struct anv_push_constants *data =
- &cmd_buffer->state.compute.base.push_constants;
- struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ struct anv_cmd_pipeline_state *pipe_state = &cmd_buffer->state.compute.base;
+ struct anv_push_constants *data = &pipe_state->push_constants;
+ struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
const struct anv_push_range *range = &pipeline->cs->bind_map.push_ranges[0];
- const struct brw_cs_dispatch_info dispatch =
+ const struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
const unsigned total_push_constants_size =
brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
if (total_push_constants_size == 0)
return (struct anv_state) { .offset = 0 };
- const unsigned push_constant_alignment =
- cmd_buffer->device->info.ver < 8 ? 32 : 64;
+ const unsigned push_constant_alignment = 64;
const unsigned aligned_total_push_constants_size =
ALIGN(total_push_constants_size, push_constant_alignment);
struct anv_state state;
@@ -1185,6 +1265,8 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
aligned_total_push_constants_size,
push_constant_alignment);
}
+ if (state.map == NULL)
+ return state;
void *dst = state.map;
const void *src = (char *)data + (range->start * 32);
@@ -1211,394 +1293,131 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
return state;
}
-void anv_CmdPushConstants(
+void anv_CmdPushConstants2KHR(
VkCommandBuffer commandBuffer,
- VkPipelineLayout layout,
- VkShaderStageFlags stageFlags,
- uint32_t offset,
- uint32_t size,
- const void* pValues)
+ const VkPushConstantsInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- if (stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
+ if (pInfo->stageFlags & ANV_GRAPHICS_STAGE_BITS) {
struct anv_cmd_pipeline_state *pipe_state =
&cmd_buffer->state.gfx.base;
- memcpy(pipe_state->push_constants.client_data + offset, pValues, size);
+ memcpy(pipe_state->push_constants.client_data + pInfo->offset,
+ pInfo->pValues, pInfo->size);
+ pipe_state->push_constants_data_dirty = true;
}
- if (stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
+ if (pInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
struct anv_cmd_pipeline_state *pipe_state =
&cmd_buffer->state.compute.base;
- memcpy(pipe_state->push_constants.client_data + offset, pValues, size);
+ memcpy(pipe_state->push_constants.client_data + pInfo->offset,
+ pInfo->pValues, pInfo->size);
+ pipe_state->push_constants_data_dirty = true;
}
- if (stageFlags & (VK_SHADER_STAGE_RAYGEN_BIT_KHR |
- VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
- VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
- VK_SHADER_STAGE_MISS_BIT_KHR |
- VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
- VK_SHADER_STAGE_CALLABLE_BIT_KHR)) {
+ if (pInfo->stageFlags & ANV_RT_STAGE_BITS) {
struct anv_cmd_pipeline_state *pipe_state =
&cmd_buffer->state.rt.base;
- memcpy(pipe_state->push_constants.client_data + offset, pValues, size);
+ memcpy(pipe_state->push_constants.client_data + pInfo->offset,
+ pInfo->pValues, pInfo->size);
+ pipe_state->push_constants_data_dirty = true;
}
- cmd_buffer->state.push_constants_dirty |= stageFlags;
-}
-
-VkResult anv_CreateCommandPool(
- VkDevice _device,
- const VkCommandPoolCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkCommandPool* pCmdPool)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_cmd_pool *pool;
-
- pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
- VK_OBJECT_TYPE_COMMAND_POOL);
- if (pool == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (pAllocator)
- pool->alloc = *pAllocator;
- else
- pool->alloc = device->vk.alloc;
-
- list_inithead(&pool->cmd_buffers);
-
- pool->flags = pCreateInfo->flags;
-
- *pCmdPool = anv_cmd_pool_to_handle(pool);
-
- return VK_SUCCESS;
+ cmd_buffer->state.push_constants_dirty |= pInfo->stageFlags;
}
-void anv_DestroyCommandPool(
- VkDevice _device,
- VkCommandPool commandPool,
- const VkAllocationCallbacks* pAllocator)
+static struct anv_cmd_pipeline_state *
+anv_cmd_buffer_get_pipe_state(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_cmd_pool, pool, commandPool);
-
- if (!pool)
- return;
-
- list_for_each_entry_safe(struct anv_cmd_buffer, cmd_buffer,
- &pool->cmd_buffers, pool_link) {
- anv_cmd_buffer_destroy(cmd_buffer);
- }
-
- vk_object_free(&device->vk, pAllocator, pool);
-}
-
-VkResult anv_ResetCommandPool(
- VkDevice device,
- VkCommandPool commandPool,
- VkCommandPoolResetFlags flags)
-{
- ANV_FROM_HANDLE(anv_cmd_pool, pool, commandPool);
-
- list_for_each_entry(struct anv_cmd_buffer, cmd_buffer,
- &pool->cmd_buffers, pool_link) {
- anv_cmd_buffer_reset(cmd_buffer);
- }
-
- return VK_SUCCESS;
-}
-
-void anv_TrimCommandPool(
- VkDevice device,
- VkCommandPool commandPool,
- VkCommandPoolTrimFlags flags)
-{
- /* Nothing for us to do here. Our pools stay pretty tidy. */
-}
-
-/**
- * Return NULL if the current subpass has no depthstencil attachment.
- */
-const struct anv_image_view *
-anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer)
-{
- const struct anv_subpass *subpass = cmd_buffer->state.subpass;
-
- if (subpass->depth_stencil_attachment == NULL)
- return NULL;
-
- const struct anv_image_view *iview =
- cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment].image_view;
-
- assert(iview->vk.aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT));
-
- return iview;
-}
-
-static struct anv_descriptor_set *
-anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
- VkPipelineBindPoint bind_point,
- struct anv_descriptor_set_layout *layout,
- uint32_t _set)
-{
- struct anv_cmd_pipeline_state *pipe_state;
-
switch (bind_point) {
case VK_PIPELINE_BIND_POINT_GRAPHICS:
- pipe_state = &cmd_buffer->state.gfx.base;
- break;
-
+ return &cmd_buffer->state.gfx.base;
case VK_PIPELINE_BIND_POINT_COMPUTE:
- pipe_state = &cmd_buffer->state.compute.base;
- break;
-
+ return &cmd_buffer->state.compute.base;
case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
- pipe_state = &cmd_buffer->state.rt.base;
+ return &cmd_buffer->state.rt.base;
break;
-
default:
unreachable("invalid bind point");
}
-
- struct anv_push_descriptor_set **push_set =
- &pipe_state->push_descriptors[_set];
-
- if (*push_set == NULL) {
- *push_set = vk_zalloc(&cmd_buffer->pool->alloc,
- sizeof(struct anv_push_descriptor_set), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (*push_set == NULL) {
- anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
- return NULL;
- }
- }
-
- struct anv_descriptor_set *set = &(*push_set)->set;
-
- if (set->layout != layout) {
- if (set->layout)
- anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout);
- anv_descriptor_set_layout_ref(layout);
- set->layout = layout;
- }
- set->size = anv_descriptor_set_layout_size(layout, 0);
- set->buffer_view_count = layout->buffer_view_count;
- set->descriptor_count = layout->descriptor_count;
- set->buffer_views = (*push_set)->buffer_views;
-
- if (layout->descriptor_buffer_size &&
- ((*push_set)->set_used_on_gpu ||
- set->desc_mem.alloc_size < layout->descriptor_buffer_size)) {
- /* The previous buffer is either actively used by some GPU command (so
- * we can't modify it) or is too small. Allocate a new one.
- */
- struct anv_state desc_mem =
- anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
- anv_descriptor_set_layout_descriptor_buffer_size(layout, 0),
- ANV_UBO_ALIGNMENT);
- if (set->desc_mem.alloc_size) {
- /* TODO: Do we really need to copy all the time? */
- memcpy(desc_mem.map, set->desc_mem.map,
- MIN2(desc_mem.alloc_size, set->desc_mem.alloc_size));
- }
- set->desc_mem = desc_mem;
-
- set->desc_addr = (struct anv_address) {
- .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo,
- .offset = set->desc_mem.offset,
- };
-
- enum isl_format format =
- anv_isl_format_for_descriptor_type(cmd_buffer->device,
- VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
-
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- set->desc_surface_state =
- anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
- isl_dev->ss.size, isl_dev->ss.align);
- anv_fill_buffer_surface_state(cmd_buffer->device,
- set->desc_surface_state, format,
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
- set->desc_addr,
- layout->descriptor_buffer_size, 1);
- }
-
- return set;
}
-void anv_CmdPushDescriptorSetKHR(
- VkCommandBuffer commandBuffer,
- VkPipelineBindPoint pipelineBindPoint,
- VkPipelineLayout _layout,
- uint32_t _set,
- uint32_t descriptorWriteCount,
- const VkWriteDescriptorSet* pDescriptorWrites)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout);
-
- assert(_set < MAX_SETS);
-
- struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout;
-
- struct anv_descriptor_set *set =
- anv_cmd_buffer_push_descriptor_set(cmd_buffer, pipelineBindPoint,
- set_layout, _set);
- if (!set)
- return;
-
- /* Go through the user supplied descriptors. */
- for (uint32_t i = 0; i < descriptorWriteCount; i++) {
- const VkWriteDescriptorSet *write = &pDescriptorWrites[i];
-
- switch (write->descriptorType) {
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- for (uint32_t j = 0; j < write->descriptorCount; j++) {
- anv_descriptor_set_write_image_view(cmd_buffer->device, set,
- write->pImageInfo + j,
- write->descriptorType,
- write->dstBinding,
- write->dstArrayElement + j);
- }
- break;
-
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- for (uint32_t j = 0; j < write->descriptorCount; j++) {
- ANV_FROM_HANDLE(anv_buffer_view, bview,
- write->pTexelBufferView[j]);
-
- anv_descriptor_set_write_buffer_view(cmd_buffer->device, set,
- write->descriptorType,
- bview,
- write->dstBinding,
- write->dstArrayElement + j);
- }
- break;
-
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- for (uint32_t j = 0; j < write->descriptorCount; j++) {
- ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer);
-
- anv_descriptor_set_write_buffer(cmd_buffer->device, set,
- &cmd_buffer->surface_state_stream,
- write->descriptorType,
- buffer,
- write->dstBinding,
- write->dstArrayElement + j,
- write->pBufferInfo[j].offset,
- write->pBufferInfo[j].range);
- }
- break;
-
- case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
- const VkWriteDescriptorSetAccelerationStructureKHR *accel_write =
- vk_find_struct_const(write, WRITE_DESCRIPTOR_SET_ACCELERATION_STRUCTURE_KHR);
- assert(accel_write->accelerationStructureCount ==
- write->descriptorCount);
- for (uint32_t j = 0; j < write->descriptorCount; j++) {
- ANV_FROM_HANDLE(anv_acceleration_structure, accel,
- accel_write->pAccelerationStructures[j]);
- anv_descriptor_set_write_acceleration_structure(cmd_buffer->device,
- set, accel,
- write->dstBinding,
- write->dstArrayElement + j);
- }
- break;
- }
-
- default:
- break;
- }
- }
-
- anv_cmd_buffer_bind_descriptor_set(cmd_buffer, pipelineBindPoint,
- layout, _set, set, NULL, NULL);
-}
-
-void anv_CmdPushDescriptorSetWithTemplateKHR(
- VkCommandBuffer commandBuffer,
- VkDescriptorUpdateTemplate descriptorUpdateTemplate,
- VkPipelineLayout _layout,
- uint32_t _set,
- const void* pData)
+static void
+anv_cmd_buffer_push_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
+ VkPipelineBindPoint bind_point,
+ const VkPushDescriptorSetInfoKHR *pInfo)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_descriptor_update_template, template,
- descriptorUpdateTemplate);
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, _layout);
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pInfo->layout);
+ struct anv_pipeline_sets_layout *layout = &pipeline_layout->sets_layout;
- assert(_set < MAX_PUSH_DESCRIPTORS);
+ assert(pInfo->set < MAX_SETS);
- struct anv_descriptor_set_layout *set_layout = layout->set[_set].layout;
-
- struct anv_descriptor_set *set =
- anv_cmd_buffer_push_descriptor_set(cmd_buffer, template->bind_point,
- set_layout, _set);
- if (!set)
+ struct anv_descriptor_set_layout *set_layout = layout->set[pInfo->set].layout;
+ struct anv_push_descriptor_set *push_set =
+ &anv_cmd_buffer_get_pipe_state(cmd_buffer,
+ bind_point)->push_descriptor;
+ if (!anv_push_descriptor_set_init(cmd_buffer, push_set, set_layout))
return;
- anv_descriptor_set_write_template(cmd_buffer->device, set,
- &cmd_buffer->surface_state_stream,
- template,
- pData);
-
- anv_cmd_buffer_bind_descriptor_set(cmd_buffer, template->bind_point,
- layout, _set, set, NULL, NULL);
-}
+ anv_descriptor_set_write(cmd_buffer->device, &push_set->set,
+ pInfo->descriptorWriteCount,
+ pInfo->pDescriptorWrites);
-void anv_CmdSetDeviceMask(
- VkCommandBuffer commandBuffer,
- uint32_t deviceMask)
-{
- /* No-op */
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer, bind_point,
+ layout, pInfo->set, &push_set->set,
+ NULL, NULL);
}
-void anv_CmdSetColorWriteEnableEXT(
- VkCommandBuffer commandBuffer,
- uint32_t attachmentCount,
- const VkBool32* pColorWriteEnables)
+void anv_CmdPushDescriptorSet2KHR(
+ VkCommandBuffer commandBuffer,
+ const VkPushDescriptorSetInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- assert(attachmentCount < MAX_RTS);
-
- uint8_t color_writes = 0;
- for (uint32_t i = 0; i < attachmentCount; i++)
- color_writes |= pColorWriteEnables[i] ? (1 << i) : 0;
-
- if (cmd_buffer->state.gfx.dynamic.color_writes != color_writes) {
- cmd_buffer->state.gfx.dynamic.color_writes = color_writes;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
- }
+ if (pInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
+ anv_cmd_buffer_push_descriptor_sets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_COMPUTE,
+ pInfo);
+ if (pInfo->stageFlags & ANV_GRAPHICS_STAGE_BITS)
+ anv_cmd_buffer_push_descriptor_sets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_GRAPHICS,
+ pInfo);
+ if (pInfo->stageFlags & ANV_RT_STAGE_BITS)
+ anv_cmd_buffer_push_descriptor_sets(cmd_buffer,
+ VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR,
+ pInfo);
}
-void anv_CmdSetFragmentShadingRateKHR(
- VkCommandBuffer commandBuffer,
- const VkExtent2D* pFragmentSize,
- const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
+void anv_CmdPushDescriptorSetWithTemplate2KHR(
+ VkCommandBuffer commandBuffer,
+ const VkPushDescriptorSetWithTemplateInfoKHR* pInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ VK_FROM_HANDLE(vk_descriptor_update_template, template,
+ pInfo->descriptorUpdateTemplate);
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pInfo->layout);
+ struct anv_pipeline_sets_layout *layout = &pipeline_layout->sets_layout;
+
+ assert(pInfo->set < MAX_PUSH_DESCRIPTORS);
+
+ struct anv_descriptor_set_layout *set_layout = layout->set[pInfo->set].layout;
+ UNUSED VkShaderStageFlags stages;
+ struct anv_cmd_pipeline_state *pipe_state =
+ anv_cmd_buffer_get_pipeline_layout_state(cmd_buffer, template->bind_point,
+ set_layout, &stages);
+ struct anv_push_descriptor_set *push_set = &pipe_state->push_descriptor;
+ if (!anv_push_descriptor_set_init(cmd_buffer, push_set, set_layout))
+ return;
- cmd_buffer->state.gfx.dynamic.fragment_shading_rate = *pFragmentSize;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE;
-}
+ anv_descriptor_set_write_template(cmd_buffer->device, &push_set->set,
+ template,
+ pInfo->pData);
-static inline uint32_t
-ilog2_round_up(uint32_t value)
-{
- assert(value != 0);
- return 32 - __builtin_clz(value - 1);
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer, template->bind_point,
+ layout, pInfo->set, &push_set->set,
+ NULL, NULL);
}
void anv_CmdSetRayTracingPipelineStackSizeKHR(
@@ -1614,14 +1433,14 @@ void anv_CmdSetRayTracingPipelineStackSizeKHR(
uint32_t stack_ids_per_dss = 2048; /* TODO */
- unsigned stack_size_log2 = ilog2_round_up(pipelineStackSize);
+ unsigned stack_size_log2 = util_logbase2_ceil(pipelineStackSize);
if (stack_size_log2 < 10)
stack_size_log2 = 10;
if (rt->scratch.layout.total_size == 1 << stack_size_log2)
return;
- brw_rt_compute_scratch_layout(&rt->scratch.layout, &device->info,
+ brw_rt_compute_scratch_layout(&rt->scratch.layout, device->info,
stack_ids_per_dss, 1 << stack_size_log2);
unsigned bucket = stack_size_log2 - 10;
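
As a worked example of the rounding above: pipelineStackSize = 1500 gives util_logbase2_ceil(1500) = 11, which is already at least the floor of 10, so each stack gets 1 << 11 = 2048 bytes and the bucket index is 11 - 10 = 1. A portable sketch of the same computation, with a naive log2-ceiling standing in for Mesa's util_logbase2_ceil():

   #include <stdint.h>

   /* Smallest n such that (1u << n) >= value, for value > 0. */
   static unsigned
   log2_ceil(uint32_t value)
   {
      unsigned n = 0;
      while ((1ull << n) < value)
         n++;
      return n;
   }

   static unsigned
   stack_bucket(uint32_t pipeline_stack_size)
   {
      unsigned log2_size = log2_ceil(pipeline_stack_size);
      if (log2_size < 10)
         log2_size = 10;       /* never go below 1 KiB per stack */
      return log2_size - 10;   /* bucket 0 == 1 KiB, 1 == 2 KiB, ... */
   }
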
@@ -1632,7 +1451,7 @@ void anv_CmdSetRayTracingPipelineStackSizeKHR(
struct anv_bo *new_bo;
VkResult result = anv_device_alloc_bo(device, "RT scratch",
rt->scratch.layout.total_size,
- 0, /* alloc_flags */
+ ANV_BO_ALLOC_INTERNAL, /* alloc_flags */
0, /* explicit_address */
&new_bo);
if (result != VK_SUCCESS) {
@@ -1651,3 +1470,69 @@ void anv_CmdSetRayTracingPipelineStackSizeKHR(
rt->scratch.bo = bo;
}
+
+void
+anv_cmd_buffer_save_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t flags,
+ struct anv_cmd_saved_state *state)
+{
+ state->flags = flags;
+
+ /* We only support the compute pipeline at the moment. */
+ assert(state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE);
+ const struct anv_cmd_pipeline_state *pipe_state =
+ &cmd_buffer->state.compute.base;
+
+ if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE)
+ state->pipeline = pipe_state->pipeline;
+
+ if (state->flags & ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0)
+ state->descriptor_set = pipe_state->descriptors[0];
+
+ if (state->flags & ANV_CMD_SAVED_STATE_PUSH_CONSTANTS) {
+ memcpy(state->push_constants, pipe_state->push_constants.client_data,
+ sizeof(state->push_constants));
+ }
+}
+
+void
+anv_cmd_buffer_restore_state(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_saved_state *state)
+{
+ VkCommandBuffer cmd_buffer_ = anv_cmd_buffer_to_handle(cmd_buffer);
+
+ assert(state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE);
+ const VkPipelineBindPoint bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
+ const VkShaderStageFlags stage_flags = VK_SHADER_STAGE_COMPUTE_BIT;
+ struct anv_cmd_pipeline_state *pipe_state = &cmd_buffer->state.compute.base;
+
+ if (state->flags & ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE) {
+ if (state->pipeline) {
+ anv_CmdBindPipeline(cmd_buffer_, bind_point,
+ anv_pipeline_to_handle(state->pipeline));
+ } else {
+ pipe_state->pipeline = NULL;
+ }
+ }
+
+ if (state->flags & ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0) {
+ if (state->descriptor_set) {
+ anv_cmd_buffer_bind_descriptor_set(cmd_buffer, bind_point, NULL, 0,
+ state->descriptor_set, NULL, NULL);
+ } else {
+ pipe_state->descriptors[0] = NULL;
+ }
+ }
+
+ if (state->flags & ANV_CMD_SAVED_STATE_PUSH_CONSTANTS) {
+ VkPushConstantsInfoKHR push_info = {
+ .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
+ .layout = VK_NULL_HANDLE,
+ .stageFlags = stage_flags,
+ .offset = 0,
+ .size = sizeof(state->push_constants),
+ .pValues = state->push_constants,
+ };
+ anv_CmdPushConstants2KHR(cmd_buffer_, &push_info);
+ }
+}
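
The save/restore pair above lets internal meta operations temporarily take over the compute bind point and then put the application's state back. A hedged sketch of the intended call pattern, assuming the usual anv_private.h types are available; the dispatch in the middle stands in for whatever internal kernel is being run:

   /* Illustrative only: wraps an internal compute dispatch so that the
    * application's pipeline, set 0 and push constants survive it. */
   static void
   run_internal_compute_op(struct anv_cmd_buffer *cmd_buffer)
   {
      struct anv_cmd_saved_state saved;
      const uint32_t flags = ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
                             ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0 |
                             ANV_CMD_SAVED_STATE_PUSH_CONSTANTS;

      anv_cmd_buffer_save_state(cmd_buffer, flags, &saved);

      /* bind the internal pipeline/descriptors and dispatch here */

      anv_cmd_buffer_restore_state(cmd_buffer, &saved);
   }
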
diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index cab5402e51b..21fa7f534ca 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -36,215 +36,465 @@
* Descriptor set layouts.
*/
+static void
+anv_descriptor_data_alignment(enum anv_descriptor_data data,
+ enum anv_descriptor_set_layout_type layout_type,
+ unsigned *out_surface_align,
+ unsigned *out_sampler_align)
+{
+ unsigned surface_align = 1, sampler_align = 1;
+
+ if (data & (ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE |
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE |
+ ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE))
+ surface_align = MAX2(surface_align, 8);
+
+ if (data & ANV_DESCRIPTOR_SURFACE)
+ surface_align = MAX2(surface_align, ANV_SURFACE_STATE_SIZE);
+
+ if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) {
+ surface_align = MAX2(surface_align, ANV_SURFACE_STATE_SIZE);
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT)
+ sampler_align = MAX2(sampler_align, ANV_SAMPLER_STATE_SIZE);
+ }
+
+ if (data & ANV_DESCRIPTOR_SAMPLER) {
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT)
+ sampler_align = MAX2(sampler_align, ANV_SAMPLER_STATE_SIZE);
+ else
+ surface_align = MAX2(surface_align, ANV_SAMPLER_STATE_SIZE);
+ }
+
+ if (data & ANV_DESCRIPTOR_INLINE_UNIFORM)
+ surface_align = MAX2(surface_align, ANV_UBO_ALIGNMENT);
+
+ *out_surface_align = surface_align;
+ *out_sampler_align = sampler_align;
+}
+
static enum anv_descriptor_data
-anv_descriptor_data_for_type(const struct anv_physical_device *device,
- VkDescriptorType type)
+anv_indirect_descriptor_data_for_type(VkDescriptorType type)
{
enum anv_descriptor_data data = 0;
switch (type) {
case VK_DESCRIPTOR_TYPE_SAMPLER:
- data = ANV_DESCRIPTOR_SAMPLER_STATE;
- if (device->has_bindless_samplers)
- data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
+ data = ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE;
break;
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- data = ANV_DESCRIPTOR_SURFACE_STATE |
- ANV_DESCRIPTOR_SAMPLER_STATE;
- if (device->has_bindless_images || device->has_bindless_samplers)
- data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE;
break;
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- data = ANV_DESCRIPTOR_SURFACE_STATE;
- if (device->has_bindless_images)
- data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
- break;
-
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- data = ANV_DESCRIPTOR_SURFACE_STATE;
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE;
break;
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- data = ANV_DESCRIPTOR_SURFACE_STATE;
- if (device->info.ver < 9)
- data |= ANV_DESCRIPTOR_IMAGE_PARAM;
- if (device->has_bindless_images)
- data |= ANV_DESCRIPTOR_STORAGE_IMAGE;
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE;
break;
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- data = ANV_DESCRIPTOR_SURFACE_STATE |
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
ANV_DESCRIPTOR_BUFFER_VIEW;
break;
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- data = ANV_DESCRIPTOR_SURFACE_STATE;
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE;
break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
data = ANV_DESCRIPTOR_INLINE_UNIFORM;
break;
case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
- data = ANV_DESCRIPTOR_ADDRESS_RANGE;
+ data = ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE;
break;
default:
unreachable("Unsupported descriptor type");
}
- /* On gfx8 and above when we have softpin enabled, we also need to push
- * SSBO address ranges so that we can use A64 messages in the shader.
- */
- if (device->has_a64_buffer_access &&
- (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
- type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
- type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
- type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC))
- data |= ANV_DESCRIPTOR_ADDRESS_RANGE;
-
- /* On Ivy Bridge and Bay Trail, we need swizzles textures in the shader
- * Do not handle VK_DESCRIPTOR_TYPE_STORAGE_IMAGE and
- * VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT because they already must
- * have identity swizzle.
+ /* We also need to push SSBO address ranges so that we can use A64
+ * messages in the shader.
*/
- if (device->info.verx10 == 70 &&
- (type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE ||
- type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER))
- data |= ANV_DESCRIPTOR_TEXTURE_SWIZZLE;
+ if (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
+ type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
+ type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
+ data |= ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE;
+
+ return data;
+}
+
+static enum anv_descriptor_data
+anv_direct_descriptor_data_for_type(const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type layout_type,
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ VkDescriptorType type)
+{
+ enum anv_descriptor_data data = 0;
+
+ switch (type) {
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ if (set_flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT)
+ return 0;
+ data = ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_SAMPLER;
+ break;
+
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) {
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_SURFACE |
+ ANV_DESCRIPTOR_SAMPLER;
+ } else {
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE |
+ ANV_DESCRIPTOR_SURFACE_SAMPLER;
+ }
+ break;
+
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ data = ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_SURFACE;
+ break;
+
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
+ data = ANV_DESCRIPTOR_INLINE_UNIFORM;
+ break;
+
+ case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
+ data = ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE;
+ break;
+
+ default:
+ unreachable("Unsupported descriptor type");
+ }
+
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER) {
+ if (set_flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) {
+ /* Push descriptors are special with descriptor buffers. On Gfx12.5+
+ * they have their own pool and are not reachable by the binding
+ * table. On previous generations, they are only reachable through
+ * the binding table.
+ */
+ if (device->uses_ex_bso) {
+ data &= ~(ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE);
+ }
+ } else {
+ /* Non-push descriptor buffers cannot be accessed through the binding
+ * table on any platform.
+ */
+ data &= ~(ANV_DESCRIPTOR_BTI_SURFACE_STATE |
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE);
+ }
+ }
return data;
}
-static unsigned
-anv_descriptor_data_size(enum anv_descriptor_data data)
+static enum anv_descriptor_data
+anv_descriptor_data_for_type(const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type layout_type,
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ VkDescriptorType type)
{
- unsigned size = 0;
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER)
+ return anv_direct_descriptor_data_for_type(device, layout_type, set_flags, type);
+ else if (device->indirect_descriptors)
+ return anv_indirect_descriptor_data_for_type(type);
+ else
+ return anv_direct_descriptor_data_for_type(device, layout_type, set_flags, type);
+}
- if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE)
- size += sizeof(struct anv_sampled_image_descriptor);
+static enum anv_descriptor_data
+anv_descriptor_data_for_mutable_type(const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type layout_type,
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info,
+ int binding)
+{
+ enum anv_descriptor_data desc_data = 0;
- if (data & ANV_DESCRIPTOR_STORAGE_IMAGE)
- size += sizeof(struct anv_storage_image_descriptor);
+ if (!mutable_info || mutable_info->mutableDescriptorTypeListCount <= binding) {
+ for(VkDescriptorType i = 0; i <= VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; i++) {
+ if (i == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
+ i == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+ continue;
- if (data & ANV_DESCRIPTOR_IMAGE_PARAM)
- size += BRW_IMAGE_PARAM_SIZE * 4;
+ desc_data |= anv_descriptor_data_for_type(device, layout_type, set_flags, i);
+ }
- if (data & ANV_DESCRIPTOR_ADDRESS_RANGE)
- size += sizeof(struct anv_address_range_descriptor);
+ desc_data |= anv_descriptor_data_for_type(
+ device, layout_type, set_flags, VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR);
- if (data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE)
- size += sizeof(struct anv_texture_swizzle_descriptor);
+ return desc_data;
+ }
- return size;
+ const VkMutableDescriptorTypeListEXT *type_list =
+ &mutable_info->pMutableDescriptorTypeLists[binding];
+ for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) {
+ desc_data |=
+ anv_descriptor_data_for_type(device, layout_type, set_flags,
+ type_list->pDescriptorTypes[i]);
+ }
+
+ return desc_data;
+}
+
+static void
+anv_descriptor_data_size(enum anv_descriptor_data data,
+ enum anv_descriptor_set_layout_type layout_type,
+ uint16_t *out_surface_size,
+ uint16_t *out_sampler_size)
+{
+ unsigned surface_size = 0;
+ unsigned sampler_size = 0;
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE)
+ surface_size += sizeof(struct anv_sampled_image_descriptor);
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE)
+ surface_size += sizeof(struct anv_storage_image_descriptor);
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE)
+ surface_size += sizeof(struct anv_address_range_descriptor);
+
+ if (data & ANV_DESCRIPTOR_SURFACE)
+ surface_size += ANV_SURFACE_STATE_SIZE;
+
+ /* Direct descriptors have sampler states stored separately */
+ if (layout_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT) {
+ if (data & ANV_DESCRIPTOR_SAMPLER)
+ sampler_size += ANV_SAMPLER_STATE_SIZE;
+
+ if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) {
+ surface_size += ANV_SURFACE_STATE_SIZE;
+ sampler_size += ANV_SAMPLER_STATE_SIZE;
+ }
+ } else {
+ if (data & ANV_DESCRIPTOR_SAMPLER)
+ surface_size += ANV_SAMPLER_STATE_SIZE;
+
+ if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) {
+ surface_size += ALIGN(ANV_SURFACE_STATE_SIZE + ANV_SAMPLER_STATE_SIZE,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ }
+
+ *out_surface_size = surface_size;
+ *out_sampler_size = sampler_size;
}
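
The split between the two out-parameters above is easiest to see on a combined image/sampler descriptor: a direct layout keeps the sampler state in its own heap, while a descriptor-buffer layout packs it next to the surface state and pads the pair up to the surface state size. Below is a standalone sketch with made-up sizes; the FAKE_* constants are stand-ins, not the driver's ANV_SURFACE_STATE_SIZE / ANV_SAMPLER_STATE_SIZE values.

/* Illustrative only: the constants are not the driver's real state sizes. */
#include <stdio.h>

#define FAKE_SURFACE_STATE_SIZE 64u
#define FAKE_SAMPLER_STATE_SIZE 32u
#define ALIGN_UP(v, a) (((v) + (a) - 1) / (a) * (a))

int main(void)
{
   /* Direct layout, ANV_DESCRIPTOR_SURFACE_SAMPLER: the sampler state goes
    * to its own heap, so the two sizes are reported separately.
    */
   unsigned direct_surface = FAKE_SURFACE_STATE_SIZE;
   unsigned direct_sampler = FAKE_SAMPLER_STATE_SIZE;

   /* Descriptor-buffer layout: the sampler state is packed next to the
    * surface state in the same buffer, padded up to the surface state size.
    */
   unsigned buffer_surface = ALIGN_UP(FAKE_SURFACE_STATE_SIZE +
                                      FAKE_SAMPLER_STATE_SIZE,
                                      FAKE_SURFACE_STATE_SIZE);
   unsigned buffer_sampler = 0;

   printf("direct: %u/%u  buffer: %u/%u\n",
          direct_surface, direct_sampler, buffer_surface, buffer_sampler);
   return 0;
}
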
static bool
anv_needs_descriptor_buffer(VkDescriptorType desc_type,
+ enum anv_descriptor_set_layout_type layout_type,
enum anv_descriptor_data desc_data)
{
- if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT ||
- anv_descriptor_data_size(desc_data) > 0)
+ if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
return true;
- return false;
+
+ uint16_t surface_size, sampler_size;
+ anv_descriptor_data_size(desc_data, layout_type,
+ &surface_size, &sampler_size);
+ return surface_size > 0 || sampler_size > 0;
}
/** Returns the size in bytes of each descriptor with the given layout */
-unsigned
-anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout)
+static void
+anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout,
+ enum anv_descriptor_set_layout_type layout_type,
+ uint16_t *out_surface_stride,
+ uint16_t *out_sampler_stride)
{
if (layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) {
assert(layout->data == ANV_DESCRIPTOR_INLINE_UNIFORM);
- return layout->array_size;
+ assert(layout->array_size <= UINT16_MAX);
+ *out_surface_stride = layout->array_size;
+ *out_sampler_stride = 0;
+ return;
}
- unsigned size = anv_descriptor_data_size(layout->data);
-
- /* For multi-planar bindings, we make every descriptor consume the maximum
- * number of planes so we don't have to bother with walking arrays and
- * adding things up every time. Fortunately, YCbCr samplers aren't all
- * that common and likely won't be in the middle of big arrays.
- */
- if (layout->max_plane_count > 1)
- size *= layout->max_plane_count;
-
- return size;
+ anv_descriptor_data_size(layout->data, layout_type,
+ out_surface_stride,
+ out_sampler_stride);
}
-/** Returns the size in bytes of each descriptor of the given type
- *
- * This version of the function does not have access to the entire layout so
- * it may only work on certain descriptor types where the descriptor size is
- * entirely determined by the descriptor type. Whenever possible, code should
- * use anv_descriptor_size() instead.
- */
-unsigned
-anv_descriptor_type_size(const struct anv_physical_device *pdevice,
- VkDescriptorType type)
+/** Returns size in bytes of the biggest descriptor in the given layout */
+static void
+anv_descriptor_size_for_mutable_type(const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type layout_type,
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info,
+ int binding,
+ uint16_t *out_surface_stride,
+ uint16_t *out_sampler_stride)
{
- assert(type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT &&
- type != VK_DESCRIPTOR_TYPE_SAMPLER &&
- type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
- type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
+ *out_surface_stride = 0;
+ *out_sampler_stride = 0;
+
+ if (!mutable_info ||
+ mutable_info->mutableDescriptorTypeListCount <= binding) {
+ for(VkDescriptorType i = 0; i <= VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; i++) {
+
+ if (i == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
+ i == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ i == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+ continue;
+
+ enum anv_descriptor_data desc_data =
+ anv_descriptor_data_for_type(device, layout_type, set_flags, i);
+ uint16_t surface_stride, sampler_stride;
+ anv_descriptor_data_size(desc_data, layout_type,
+ &surface_stride, &sampler_stride);
+
+ *out_surface_stride = MAX2(*out_surface_stride, surface_stride);
+ *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride);
+ }
+
+ enum anv_descriptor_data desc_data =
+ anv_descriptor_data_for_type(device, layout_type, set_flags,
+ VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR);
+ uint16_t surface_stride, sampler_stride;
+ anv_descriptor_data_size(desc_data, layout_type,
+ &surface_stride, &sampler_stride);
+
+ *out_surface_stride = MAX2(*out_surface_stride, surface_stride);
+ *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride);
- return anv_descriptor_data_size(anv_descriptor_data_for_type(pdevice, type));
+ return;
+ }
+
+ const VkMutableDescriptorTypeListEXT *type_list =
+ &mutable_info->pMutableDescriptorTypeLists[binding];
+ for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) {
+ enum anv_descriptor_data desc_data =
+ anv_descriptor_data_for_type(device, layout_type, set_flags,
+ type_list->pDescriptorTypes[i]);
+
+ uint16_t surface_stride, sampler_stride;
+ anv_descriptor_data_size(desc_data, layout_type,
+ &surface_stride, &sampler_stride);
+
+ *out_surface_stride = MAX2(*out_surface_stride, surface_stride);
+ *out_sampler_stride = MAX2(*out_sampler_stride, sampler_stride);
+ }
}
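
Note that for a mutable binding the resulting stride is the maximum over the types in the list, not their sum, so any listed type can later be written in place. A toy version of that reduction, with invented per-type sizes:

/* Illustrative reduction over a mutable type list; the sizes are made up. */
#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
   /* Pretend the list is { SAMPLED_IMAGE, STORAGE_BUFFER, ACCELERATION_STRUCTURE }
    * and these are their per-descriptor surface/sampler sizes.
    */
   const unsigned surface_sizes[] = { 64, 64, 32 };
   const unsigned sampler_sizes[] = { 32, 0, 0 };

   unsigned surface_stride = 0, sampler_stride = 0;
   for (unsigned i = 0; i < 3; i++) {
      surface_stride = MAX2(surface_stride, surface_sizes[i]);
      sampler_stride = MAX2(sampler_stride, sampler_sizes[i]);
   }

   /* Every array element of the mutable binding is laid out with the
    * worst-case stride so any of the listed types fits in place.
    */
   printf("strides: surface=%u sampler=%u\n", surface_stride, sampler_stride);
   return 0;
}
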
static bool
anv_descriptor_data_supports_bindless(const struct anv_physical_device *pdevice,
- enum anv_descriptor_data data,
- bool sampler)
+ VkDescriptorSetLayoutCreateFlags set_flags,
+ enum anv_descriptor_data data)
{
- if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) {
- assert(pdevice->has_a64_buffer_access);
- return true;
- }
+ if (set_flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) {
+ /* When using descriptor buffers, on platforms that don't have extended
+ * bindless offset, all push descriptors have to go through the binding
+ * tables.
+ */
+ if (!pdevice->uses_ex_bso &&
+ (set_flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)) {
+ return data & (ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE |
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE);
+ }
- if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
- assert(pdevice->has_bindless_images || pdevice->has_bindless_samplers);
- return sampler ? pdevice->has_bindless_samplers :
- pdevice->has_bindless_images;
- }
+ /* Otherwise we can do bindless for everything */
+ return true;
+ } else {
+ if (pdevice->indirect_descriptors) {
+ return data & (ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE |
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE |
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE);
+ }
- if (data & ANV_DESCRIPTOR_STORAGE_IMAGE) {
- assert(pdevice->has_bindless_images);
+ /* Direct descriptors support bindless for everything */
return true;
}
-
- return false;
}
bool
anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice,
- const struct anv_descriptor_set_binding_layout *binding,
- bool sampler)
+ const struct anv_descriptor_set_layout *set,
+ const struct anv_descriptor_set_binding_layout *binding)
{
- return anv_descriptor_data_supports_bindless(pdevice, binding->data,
- sampler);
+ return anv_descriptor_data_supports_bindless(pdevice, set->flags, binding->data);
}
bool
anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice,
- const struct anv_descriptor_set_binding_layout *binding,
- bool sampler)
+ const struct anv_descriptor_set_layout *set,
+ const struct anv_descriptor_set_binding_layout *binding)
{
if (pdevice->always_use_bindless)
- return anv_descriptor_supports_bindless(pdevice, binding, sampler);
+ return anv_descriptor_supports_bindless(pdevice, set, binding);
- static const VkDescriptorBindingFlagBitsEXT flags_requiring_bindless =
- VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT_EXT |
- VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT_EXT |
- VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT_EXT;
+ if (set->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)
+ return false;
+
+ if (set->flags & (VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT))
+ return true;
+
+ static const VkDescriptorBindingFlagBits flags_requiring_bindless =
+ VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT |
+ VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT |
+ VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT;
return (binding->flags & flags_requiring_bindless) != 0;
}
+static enum anv_descriptor_set_layout_type
+anv_descriptor_set_layout_type_for_flags(const struct anv_physical_device *device,
+ const VkDescriptorSetLayoutCreateInfo *pCreateInfo)
+{
+ if (pCreateInfo->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT)
+ return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER;
+ else if (device->indirect_descriptors)
+ return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT;
+ else
+ return ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT;
+}
+
+static bool
+mutable_list_includes_type(const VkMutableDescriptorTypeCreateInfoEXT *mutable_info,
+ uint32_t binding, VkDescriptorType type)
+{
+ if (!mutable_info || mutable_info->mutableDescriptorTypeListCount == 0)
+ return true;
+
+ const VkMutableDescriptorTypeListEXT *type_list =
+ &mutable_info->pMutableDescriptorTypeLists[binding];
+ for (uint32_t i = 0; i < type_list->descriptorTypeCount; i++) {
+ if (type_list->pDescriptorTypes[i] == type)
+ return true;
+ }
+
+ return false;
+}
+
void anv_GetDescriptorSetLayoutSupport(
VkDevice _device,
const VkDescriptorSetLayoutCreateInfo* pCreateInfo,
@@ -260,6 +510,12 @@ void anv_GetDescriptorSetLayoutSupport(
const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info =
vk_find_struct_const(pCreateInfo->pNext,
DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO);
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
+
+ enum anv_descriptor_set_layout_type layout_type =
+ anv_descriptor_set_layout_type_for_flags(pdevice, pCreateInfo);
for (uint32_t b = 0; b < pCreateInfo->bindingCount; b++) {
const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[b];
@@ -270,10 +526,32 @@ void anv_GetDescriptorSetLayoutSupport(
flags = binding_flags_info->pBindingFlags[b];
}
+ /* Combined image/sampler descriptors are not supported with descriptor
+ * buffers & mutable descriptor types because we cannot know from the
+ * shader where to find the sampler structure. It can be written to the
+ * beginning of the descriptor (at offset 0) or in the second part (at
+ * offset 64 bytes).
+ */
+ if ((pCreateInfo->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) &&
+ binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT &&
+ mutable_list_includes_type(mutable_info, b,
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)) {
+ pSupport->supported = false;
+ return;
+ }
+
enum anv_descriptor_data desc_data =
- anv_descriptor_data_for_type(pdevice, binding->descriptorType);
+ binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_mutable_type(pdevice, layout_type,
+ pCreateInfo->flags,
+ mutable_info, b) :
+ anv_descriptor_data_for_type(pdevice, layout_type,
+ pCreateInfo->flags,
+ binding->descriptorType);
- if (anv_needs_descriptor_buffer(binding->descriptorType, desc_data))
+ if (anv_needs_descriptor_buffer(binding->descriptorType,
+ layout_type, desc_data))
needs_descriptor_buffer = true;
if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)
@@ -284,12 +562,14 @@ void anv_GetDescriptorSetLayoutSupport(
/* There is no real limit on samplers */
break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
/* Inline uniforms don't use a binding */
break;
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- if (anv_descriptor_data_supports_bindless(pdevice, desc_data, false))
+ if (anv_descriptor_data_supports_bindless(pdevice,
+ pCreateInfo->flags,
+ desc_data))
break;
if (binding->pImmutableSamplers) {
@@ -306,7 +586,9 @@ void anv_GetDescriptorSetLayoutSupport(
break;
default:
- if (anv_descriptor_data_supports_bindless(pdevice, desc_data, false))
+ if (anv_descriptor_data_supports_bindless(pdevice,
+ pCreateInfo->flags,
+ desc_data))
break;
anv_foreach_stage(s, binding->stageFlags)
@@ -324,7 +606,7 @@ void anv_GetDescriptorSetLayoutSupport(
vk_find_struct(pSupport->pNext,
DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT);
if (vdcls != NULL) {
- if (varying_desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+ if (varying_desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
vdcls->maxVariableDescriptorCount = MAX_INLINE_UNIFORM_BLOCK_SIZE;
} else if (varying_desc_type != VK_DESCRIPTOR_TYPE_MAX_ENUM) {
vdcls->maxVariableDescriptorCount = UINT16_MAX;
@@ -378,7 +660,7 @@ VkResult anv_CreateDescriptorSetLayout(
immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount;
}
- /* We need to allocate decriptor set layouts off the device allocator
+ /* We need to allocate descriptor set layouts off the device allocator
+ /* We need to allocate descriptor set layouts off the device allocator
* with DEVICE scope because they are reference counted and may not be
* destroyed when vkDestroyDescriptorSetLayout is called.
*/
@@ -391,10 +673,13 @@ VkResult anv_CreateDescriptorSetLayout(
if (!vk_object_multizalloc(&device->vk, &ma, NULL,
VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
set_layout->ref_cnt = 1;
set_layout->binding_count = num_bindings;
+ set_layout->flags = pCreateInfo->flags;
+ set_layout->type = anv_descriptor_set_layout_type_for_flags(device->physical,
+ pCreateInfo);
for (uint32_t b = 0; b < num_bindings; b++) {
/* Initialize all binding_layout entries to -1 */
@@ -412,7 +697,9 @@ VkResult anv_CreateDescriptorSetLayout(
uint32_t buffer_view_count = 0;
uint32_t dynamic_offset_count = 0;
- uint32_t descriptor_buffer_size = 0;
+ uint32_t descriptor_buffer_surface_size = 0;
+ uint32_t descriptor_buffer_sampler_size = 0;
+ uint32_t sampler_count = 0;
for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j];
@@ -424,9 +711,13 @@ VkResult anv_CreateDescriptorSetLayout(
set_layout->binding[b].immutable_samplers = (void *)(uintptr_t)(j + 1);
}
- const VkDescriptorSetLayoutBindingFlagsCreateInfoEXT *binding_flags_info =
+ const VkDescriptorSetLayoutBindingFlagsCreateInfo *binding_flags_info =
vk_find_struct_const(pCreateInfo->pNext,
- DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO_EXT);
+ DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO);
+
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
for (uint32_t b = 0; b < num_bindings; b++) {
/* We stashed the pCreateInfo->pBindings[] index (plus one) in the
@@ -466,13 +757,21 @@ VkResult anv_CreateDescriptorSetLayout(
assert(!(set_layout->binding[b].flags &
(VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT |
VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT |
- VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)));
+ VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)));
}
}
set_layout->binding[b].data =
+ binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_mutable_type(device->physical,
+ set_layout->type,
+ pCreateInfo->flags,
+ mutable_info, b) :
anv_descriptor_data_for_type(device->physical,
+ set_layout->type,
+ pCreateInfo->flags,
binding->descriptorType);
+
set_layout->binding[b].array_size = binding->descriptorCount;
set_layout->binding[b].descriptor_index = set_layout->descriptor_count;
set_layout->descriptor_count += binding->descriptorCount;
@@ -485,6 +784,7 @@ VkResult anv_CreateDescriptorSetLayout(
switch (binding->descriptorType) {
case VK_DESCRIPTOR_TYPE_SAMPLER:
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
set_layout->binding[b].max_plane_count = 1;
if (binding->pImmutableSamplers) {
set_layout->binding[b].immutable_samplers = samplers;
@@ -522,27 +822,77 @@ VkResult anv_CreateDescriptorSetLayout(
break;
}
- if (binding->descriptorType ==
- VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
- /* Inline uniform blocks are specified to use the descriptor array
- * size as the size in bytes of the block.
- */
- descriptor_buffer_size = align_u32(descriptor_buffer_size,
- ANV_UBO_ALIGNMENT);
- set_layout->binding[b].descriptor_offset = descriptor_buffer_size;
- descriptor_buffer_size += binding->descriptorCount;
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) {
+ anv_descriptor_size_for_mutable_type(
+ device->physical, set_layout->type,
+ pCreateInfo->flags, mutable_info, b,
+ &set_layout->binding[b].descriptor_data_surface_size,
+ &set_layout->binding[b].descriptor_data_sampler_size);
+ } else {
+ anv_descriptor_size(&set_layout->binding[b],
+ set_layout->type,
+ &set_layout->binding[b].descriptor_data_surface_size,
+ &set_layout->binding[b].descriptor_data_sampler_size);
+ }
+
+ /* For multi-planar bindings, we make every descriptor consume the maximum
+ * number of planes so we don't have to bother with walking arrays and
+ * adding things up every time. Fortunately, YCbCr samplers aren't all
+ * that common and likely won't be in the middle of big arrays.
+ */
+ set_layout->binding[b].descriptor_surface_stride =
+ MAX2(set_layout->binding[b].max_plane_count, 1) *
+ set_layout->binding[b].descriptor_data_surface_size;
+ set_layout->binding[b].descriptor_sampler_stride =
+ MAX2(set_layout->binding[b].max_plane_count, 1) *
+ set_layout->binding[b].descriptor_data_sampler_size;
+
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) {
+ sampler_count += binding->descriptorCount *
+ set_layout->binding[b].max_plane_count;
+ }
+
+ unsigned surface_align, sampler_align;
+ anv_descriptor_data_alignment(set_layout->binding[b].data,
+ set_layout->type,
+ &surface_align,
+ &sampler_align);
+ descriptor_buffer_surface_size =
+ align(descriptor_buffer_surface_size, surface_align);
+ descriptor_buffer_sampler_size =
+ align(descriptor_buffer_sampler_size, sampler_align);
+
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ set_layout->binding[b].descriptor_surface_offset = descriptor_buffer_surface_size;
+ descriptor_buffer_surface_size += binding->descriptorCount;
} else {
- set_layout->binding[b].descriptor_offset = descriptor_buffer_size;
- descriptor_buffer_size += anv_descriptor_size(&set_layout->binding[b]) *
- binding->descriptorCount;
+ set_layout->binding[b].descriptor_surface_offset = descriptor_buffer_surface_size;
+ descriptor_buffer_surface_size +=
+ set_layout->binding[b].descriptor_surface_stride * binding->descriptorCount;
}
+ set_layout->binding[b].descriptor_sampler_offset = descriptor_buffer_sampler_size;
+ descriptor_buffer_sampler_size +=
+ set_layout->binding[b].descriptor_sampler_stride * binding->descriptorCount;
+
set_layout->shader_stages |= binding->stageFlags;
}
+ /* Sanity checks */
+ assert(descriptor_buffer_sampler_size == 0 ||
+ set_layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT);
+
set_layout->buffer_view_count = buffer_view_count;
set_layout->dynamic_offset_count = dynamic_offset_count;
- set_layout->descriptor_buffer_size = descriptor_buffer_size;
+ set_layout->descriptor_buffer_surface_size = descriptor_buffer_surface_size;
+ set_layout->descriptor_buffer_sampler_size = descriptor_buffer_sampler_size;
+
+ if (pCreateInfo->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT) {
+ assert(set_layout->descriptor_buffer_surface_size == 0);
+ assert(set_layout->descriptor_buffer_sampler_size == 0);
+ set_layout->embedded_sampler_count = sampler_count;
+ }
*pSetLayout = anv_descriptor_set_layout_to_handle(set_layout);
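
To make the multi-planar stride rule in the hunk above concrete: a binding whose immutable YCbCr sampler needs three planes gets a per-element stride of three times the single-plane data size, for every element of the array. A standalone sketch with placeholder sizes (the real ones come out of anv_descriptor_data_size()):

/* Placeholder numbers; only the stride arithmetic mirrors the hunk above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
   const uint16_t data_surface_size = 64;  /* per-plane surface data, made up */
   const uint16_t data_sampler_size = 32;  /* per-plane sampler data, made up */
   const uint8_t  max_plane_count   = 3;   /* e.g. a 3-plane YCbCr format */
   const uint32_t array_size        = 4;   /* descriptorCount of the binding */

   uint32_t surface_stride = max_plane_count * data_surface_size;  /* 192 */
   uint32_t sampler_stride = max_plane_count * data_sampler_size;  /* 96  */

   /* The binding then consumes stride * descriptorCount bytes in each heap. */
   printf("surface bytes = %u, sampler bytes = %u\n",
          surface_stride * array_size, sampler_stride * array_size);
   return 0;
}
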
@@ -583,7 +933,7 @@ set_layout_descriptor_count(const struct anv_descriptor_set_layout *set_layout,
assert(var_desc_count <= dynamic_binding->array_size);
uint32_t shrink = dynamic_binding->array_size - var_desc_count;
- if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
+ if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
return set_layout->descriptor_count;
return set_layout->descriptor_count - shrink;
@@ -607,30 +957,50 @@ set_layout_buffer_view_count(const struct anv_descriptor_set_layout *set_layout,
return set_layout->buffer_view_count - shrink;
}
-uint32_t
+static bool
+anv_descriptor_set_layout_empty(const struct anv_descriptor_set_layout *set_layout)
+{
+ return set_layout->binding_count == 0;
+}
+
+static void
anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout,
- uint32_t var_desc_count)
+ uint32_t var_desc_count,
+ uint32_t *out_surface_size,
+ uint32_t *out_sampler_size)
{
const struct anv_descriptor_set_binding_layout *dynamic_binding =
set_layout_dynamic_binding(set_layout);
- if (dynamic_binding == NULL)
- return ALIGN(set_layout->descriptor_buffer_size, ANV_UBO_ALIGNMENT);
+ if (dynamic_binding == NULL) {
+ *out_surface_size = ALIGN(set_layout->descriptor_buffer_surface_size,
+ ANV_UBO_ALIGNMENT);
+ *out_sampler_size = set_layout->descriptor_buffer_sampler_size;
+ return;
+ }
assert(var_desc_count <= dynamic_binding->array_size);
uint32_t shrink = dynamic_binding->array_size - var_desc_count;
- uint32_t set_size;
+ uint32_t set_surface_size, set_sampler_size;
- if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+ if (dynamic_binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
/* Inline uniform blocks are specified to use the descriptor array
* size as the size in bytes of the block.
*/
- set_size = set_layout->descriptor_buffer_size - shrink;
+ set_surface_size = set_layout->descriptor_buffer_surface_size - shrink;
+ set_sampler_size = 0;
} else {
- set_size = set_layout->descriptor_buffer_size -
- shrink * anv_descriptor_size(dynamic_binding);
+ set_surface_size =
+ set_layout->descriptor_buffer_surface_size > 0 ?
+ (set_layout->descriptor_buffer_surface_size -
+ shrink * dynamic_binding->descriptor_surface_stride) : 0;
+ set_sampler_size =
+ set_layout->descriptor_buffer_sampler_size > 0 ?
+ (set_layout->descriptor_buffer_sampler_size -
+ shrink * dynamic_binding->descriptor_sampler_stride) : 0;
}
- return ALIGN(set_size, ANV_UBO_ALIGNMENT);
+ *out_surface_size = ALIGN(set_surface_size, ANV_UBO_ALIGNMENT);
+ *out_sampler_size = set_sampler_size;
}
void anv_DestroyDescriptorSetLayout(
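
The shrink logic in anv_descriptor_set_layout_descriptor_buffer_size above is plain arithmetic: a variable-count binding declared for 1024 descriptors but allocated with only 16 drops (1024 - 16) * surface_stride bytes from the surface buffer, which is then re-aligned to the UBO alignment. A hedged, standalone sketch with invented numbers (FAKE_UBO_ALIGNMENT is not the driver's constant):

/* Invented numbers; only the shrink arithmetic mirrors the function above. */
#include <stdio.h>
#include <stdint.h>

#define FAKE_UBO_ALIGNMENT 64u
#define ALIGN_POT(v, a) (((v) + (a) - 1) & ~((a) - 1))

int main(void)
{
   const uint32_t layout_surface_size = 70000; /* full-size layout, made up */
   const uint32_t surface_stride      = 64;    /* stride of the dynamic binding */
   const uint32_t array_size          = 1024;  /* declared descriptorCount */
   const uint32_t var_desc_count      = 16;    /* count requested at allocation */

   uint32_t shrink = array_size - var_desc_count;
   uint32_t set_surface_size = layout_surface_size - shrink * surface_stride;

   printf("allocated surface bytes = %u\n",
          ALIGN_POT(set_surface_size, FAKE_UBO_ALIGNMENT));
   return 0;
}
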
@@ -647,23 +1017,52 @@ void anv_DestroyDescriptorSetLayout(
anv_descriptor_set_layout_unref(device, set_layout);
}
+void
+anv_descriptor_set_layout_print(const struct anv_descriptor_set_layout *layout)
+{
+ fprintf(stderr, "set layout:\n");
+ for (uint32_t b = 0; b < layout->binding_count; b++) {
+ fprintf(stderr, " binding%03u: offsets=0x%08x/0x%08x sizes=%04u/%04u strides=%03u/%03u planes=%hhu count=%03u\n",
+ b,
+ layout->binding[b].descriptor_surface_offset,
+ layout->binding[b].descriptor_sampler_offset,
+ layout->binding[b].descriptor_data_surface_size,
+ layout->binding[b].descriptor_data_sampler_size,
+ layout->binding[b].descriptor_surface_stride,
+ layout->binding[b].descriptor_sampler_stride,
+ layout->binding[b].max_plane_count,
+ layout->binding[b].array_size);
+ }
+}
+
#define SHA1_UPDATE_VALUE(ctx, x) _mesa_sha1_update(ctx, &(x), sizeof(x));
static void
sha1_update_immutable_sampler(struct mesa_sha1 *ctx,
+ bool embedded_sampler,
const struct anv_sampler *sampler)
{
- if (!sampler->conversion)
+ if (!sampler->vk.ycbcr_conversion)
return;
- /* The only thing that affects the shader is ycbcr conversion */
- _mesa_sha1_update(ctx, sampler->conversion,
- sizeof(*sampler->conversion));
+ /* Hash the conversion, if any, as this affects the placement of
+ * descriptors in the set due to the number of planes.
+ */
+ SHA1_UPDATE_VALUE(ctx, sampler->vk.ycbcr_conversion->state);
+
+ /* For embedded samplers, we need to hash the sampler parameters as the
+ * sampler handle is baked into the shader and this ultimately is part of
+ * the shader hash key. We can only consider 2 shaders identical if all
+ * their embedded samplers parameters are identical.
+ */
+ if (embedded_sampler)
+ SHA1_UPDATE_VALUE(ctx, sampler->sha1);
}
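
One consequence of hashing the embedded sampler parameters is that two layouts differing only in those baked-in samplers yield different shader cache keys. The sketch below uses a toy FNV-1a hash purely as a stand-in for the mesa_sha1 helpers to illustrate that property:

/* Illustration only: a toy FNV-1a hash standing in for the SHA-1 used above. */
#include <stdio.h>
#include <stdint.h>

struct fake_sampler_params { float max_anisotropy; uint32_t address_mode; };

static uint64_t
fnv1a(const void *data, size_t len, uint64_t h)
{
   const uint8_t *p = data;
   for (size_t i = 0; i < len; i++)
      h = (h ^ p[i]) * 0x100000001b3ull;
   return h;
}

int main(void)
{
   struct fake_sampler_params a = { 16.0f, 0 };
   struct fake_sampler_params b = { 1.0f, 2 };

   /* Same "layout", different embedded sampler parameters -> different keys,
    * so shaders baked against them are never treated as identical.
    */
   uint64_t key_a = fnv1a(&a, sizeof(a), 0xcbf29ce484222325ull);
   uint64_t key_b = fnv1a(&b, sizeof(b), 0xcbf29ce484222325ull);
   printf("key_a=%016llx key_b=%016llx\n",
          (unsigned long long)key_a, (unsigned long long)key_b);
   return 0;
}
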
static void
sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx,
- const struct anv_descriptor_set_binding_layout *layout)
+ bool embedded_samplers,
+ const struct anv_descriptor_set_binding_layout *layout)
{
SHA1_UPDATE_VALUE(ctx, layout->flags);
SHA1_UPDATE_VALUE(ctx, layout->data);
@@ -672,11 +1071,14 @@ sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx,
SHA1_UPDATE_VALUE(ctx, layout->descriptor_index);
SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index);
SHA1_UPDATE_VALUE(ctx, layout->buffer_view_index);
- SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_surface_offset);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_sampler_offset);
if (layout->immutable_samplers) {
- for (uint16_t i = 0; i < layout->array_size; i++)
- sha1_update_immutable_sampler(ctx, layout->immutable_samplers[i]);
+ for (uint16_t i = 0; i < layout->array_size; i++) {
+ sha1_update_immutable_sampler(ctx, embedded_samplers,
+ layout->immutable_samplers[i]);
+ }
}
}
@@ -684,15 +1086,22 @@ static void
sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
const struct anv_descriptor_set_layout *layout)
{
+ SHA1_UPDATE_VALUE(ctx, layout->flags);
SHA1_UPDATE_VALUE(ctx, layout->binding_count);
SHA1_UPDATE_VALUE(ctx, layout->descriptor_count);
SHA1_UPDATE_VALUE(ctx, layout->shader_stages);
SHA1_UPDATE_VALUE(ctx, layout->buffer_view_count);
SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count);
- SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_size);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_surface_size);
+ SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_sampler_size);
- for (uint16_t i = 0; i < layout->binding_count; i++)
- sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i]);
+ bool embedded_samplers =
+ layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT;
+
+ for (uint16_t i = 0; i < layout->binding_count; i++) {
+ sha1_update_descriptor_set_binding_layout(ctx, embedded_samplers,
+ &layout->binding[i]);
+ }
}
/*
@@ -700,6 +1109,107 @@ sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
* just multiple descriptor set layouts pasted together
*/
+void
+anv_pipeline_sets_layout_init(struct anv_pipeline_sets_layout *layout,
+ struct anv_device *device,
+ bool independent_sets)
+{
+ memset(layout, 0, sizeof(*layout));
+
+ layout->device = device;
+ layout->push_descriptor_set_index = -1;
+ layout->independent_sets = independent_sets;
+}
+
+void
+anv_pipeline_sets_layout_add(struct anv_pipeline_sets_layout *layout,
+ uint32_t set_idx,
+ struct anv_descriptor_set_layout *set_layout)
+{
+ if (layout->set[set_idx].layout)
+ return;
+
+ /* Workaround for CTS: internal CTS issue 3584 */
+ if (layout->independent_sets && anv_descriptor_set_layout_empty(set_layout))
+ return;
+
+ if (layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_UNKNOWN)
+ layout->type = set_layout->type;
+ else
+ assert(layout->type == set_layout->type);
+
+ layout->num_sets = MAX2(set_idx + 1, layout->num_sets);
+
+ layout->set[set_idx].layout =
+ anv_descriptor_set_layout_ref(set_layout);
+
+ layout->set[set_idx].dynamic_offset_start = layout->num_dynamic_buffers;
+ layout->num_dynamic_buffers += set_layout->dynamic_offset_count;
+
+ assert(layout->num_dynamic_buffers < MAX_DYNAMIC_BUFFERS);
+
+ if (set_layout->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) {
+ assert(layout->push_descriptor_set_index == -1);
+ layout->push_descriptor_set_index = set_idx;
+ }
+}
+
+uint32_t
+anv_pipeline_sets_layout_embedded_sampler_count(const struct anv_pipeline_sets_layout *layout)
+{
+ uint32_t count = 0;
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+ count += layout->set[s].layout->embedded_sampler_count;
+ }
+ return count;
+}
+
+void
+anv_pipeline_sets_layout_hash(struct anv_pipeline_sets_layout *layout)
+{
+ struct mesa_sha1 ctx;
+ _mesa_sha1_init(&ctx);
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+ sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout);
+ _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start,
+ sizeof(layout->set[s].dynamic_offset_start));
+ }
+ _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets));
+ _mesa_sha1_final(&ctx, layout->sha1);
+}
+
+void
+anv_pipeline_sets_layout_fini(struct anv_pipeline_sets_layout *layout)
+{
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+
+ anv_descriptor_set_layout_unref(layout->device, layout->set[s].layout);
+ }
+}
+
+void
+anv_pipeline_sets_layout_print(const struct anv_pipeline_sets_layout *layout)
+{
+ fprintf(stderr, "layout: dyn_count=%u sets=%u ind=%u\n",
+ layout->num_dynamic_buffers,
+ layout->num_sets,
+ layout->independent_sets);
+ for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+
+ fprintf(stderr, " set%i: dyn_start=%u flags=0x%x\n",
+ s, layout->set[s].dynamic_offset_start, layout->set[s].layout->flags);
+ }
+}
+
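
The helpers above form a small lifecycle: init once, add each set layout at its index, hash after the last add, and fini when the owning object is destroyed. anv_CreatePipelineLayout below is the real user; this condensed sketch (driver-internal types assumed, error handling omitted) only shows the expected ordering:

/* Sketch of the expected call order; only meaningful inside the driver tree
 * (anv_private.h) and with error handling omitted.
 */
static void
example_build_sets_layout(struct anv_device *device,
                          struct anv_descriptor_set_layout **set_layouts,
                          uint32_t set_count)
{
   struct anv_pipeline_sets_layout sets_layout;

   anv_pipeline_sets_layout_init(&sets_layout, device,
                                 false /* independent_sets */);

   for (uint32_t s = 0; s < set_count; s++) {
      if (set_layouts[s] != NULL)
         anv_pipeline_sets_layout_add(&sets_layout, s, set_layouts[s]);
   }

   /* Hash once every set has been added; the SHA-1 feeds pipeline caching. */
   anv_pipeline_sets_layout_hash(&sets_layout);

   /* ... use sets_layout ... */

   anv_pipeline_sets_layout_fini(&sets_layout);
}
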
VkResult anv_CreatePipelineLayout(
VkDevice _device,
const VkPipelineLayoutCreateInfo* pCreateInfo,
@@ -711,40 +1221,33 @@ VkResult anv_CreatePipelineLayout(
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO);
- layout = vk_object_alloc(&device->vk, pAllocator, sizeof(*layout),
- VK_OBJECT_TYPE_PIPELINE_LAYOUT);
+ layout = vk_object_zalloc(&device->vk, pAllocator, sizeof(*layout),
+ VK_OBJECT_TYPE_PIPELINE_LAYOUT);
if (layout == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- layout->num_sets = pCreateInfo->setLayoutCount;
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- unsigned dynamic_offset_count = 0;
+ anv_pipeline_sets_layout_init(&layout->sets_layout, device,
+ pCreateInfo->flags & VK_PIPELINE_LAYOUT_CREATE_INDEPENDENT_SETS_BIT_EXT);
for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout,
pCreateInfo->pSetLayouts[set]);
- layout->set[set].layout = set_layout;
- anv_descriptor_set_layout_ref(set_layout);
- layout->set[set].dynamic_offset_start = dynamic_offset_count;
- for (uint32_t b = 0; b < set_layout->binding_count; b++) {
- if (set_layout->binding[b].dynamic_offset_index < 0)
- continue;
+ /* VUID-VkPipelineLayoutCreateInfo-graphicsPipelineLibrary-06753
+ *
+ * "If graphicsPipelineLibrary is not enabled, elements of
+ * pSetLayouts must be valid VkDescriptorSetLayout objects"
+ *
+ * As a result of supporting graphicsPipelineLibrary, we need to allow
+ * null descriptor set layouts.
+ */
+ if (set_layout == NULL)
+ continue;
- dynamic_offset_count += set_layout->binding[b].array_size;
- }
+ anv_pipeline_sets_layout_add(&layout->sets_layout, set, set_layout);
}
- assert(dynamic_offset_count < MAX_DYNAMIC_BUFFERS);
- struct mesa_sha1 ctx;
- _mesa_sha1_init(&ctx);
- for (unsigned s = 0; s < layout->num_sets; s++) {
- sha1_update_descriptor_set_layout(&ctx, layout->set[s].layout);
- _mesa_sha1_update(&ctx, &layout->set[s].dynamic_offset_start,
- sizeof(layout->set[s].dynamic_offset_start));
- }
- _mesa_sha1_update(&ctx, &layout->num_sets, sizeof(layout->num_sets));
- _mesa_sha1_final(&ctx, layout->sha1);
+ anv_pipeline_sets_layout_hash(&layout->sets_layout);
*pPipelineLayout = anv_pipeline_layout_to_handle(layout);
@@ -757,30 +1260,29 @@ void anv_DestroyPipelineLayout(
const VkAllocationCallbacks* pAllocator)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, _pipelineLayout);
+ ANV_FROM_HANDLE(anv_pipeline_layout, layout, _pipelineLayout);
- if (!pipeline_layout)
+ if (!layout)
return;
- for (uint32_t i = 0; i < pipeline_layout->num_sets; i++)
- anv_descriptor_set_layout_unref(device, pipeline_layout->set[i].layout);
+ anv_pipeline_sets_layout_fini(&layout->sets_layout);
- vk_object_free(&device->vk, pAllocator, pipeline_layout);
+ vk_object_free(&device->vk, pAllocator, layout);
}
/*
* Descriptor pools.
*
- * These are implemented using a big pool of memory and a free-list for the
+ * These are implemented using a big pool of memory and a vma heap for the
* host memory allocations and a state_stream and a free list for the buffer
* view surface state. The spec allows us to fail to allocate due to
* fragmentation in all cases but two: 1) after pool reset, allocating up
* until the pool size with no freeing must succeed and 2) allocating and
- * freeing only descriptor sets with the same layout. Case 1) is easy enogh,
- * and the free lists lets us recycle blocks for case 2).
+ * freeing only descriptor sets with the same layout. Case 1) is easy enough,
+ * and the vma heap ensures case 2).
*/
-/* The vma heap reserves 0 to mean NULL; we have to offset by some ammount to
+/* The vma heap reserves 0 to mean NULL; we have to offset by some amount to
* ensure we can allocate the entire BO without hitting zero. The actual
* amount doesn't matter.
*/
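
The reservation of 0 is the whole point of that base offset: the heap covers [POOL_HEAP_OFFSET, POOL_HEAP_OFFSET + size), so a successful allocation can never be 0, 0 keeps meaning failure, and the stored state offset is simply the returned address minus the base. A standalone toy bump allocator showing the same pattern (the real code below uses util_vma_heap):

/* Toy stand-in for util_vma_heap, just to show why the base offset matters. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_POOL_HEAP_OFFSET 64u  /* made-up value; only needs to be non-zero */

struct toy_heap { uint64_t next, end; };

static uint64_t
toy_alloc(struct toy_heap *h, uint64_t size)
{
   if (h->next + size > h->end)
      return 0; /* 0 unambiguously means "allocation failed" */
   uint64_t addr = h->next;
   h->next += size;
   return addr;
}

int main(void)
{
   struct toy_heap h = {
      .next = FAKE_POOL_HEAP_OFFSET,
      .end  = FAKE_POOL_HEAP_OFFSET + 4096,
   };

   uint64_t vma = toy_alloc(&h, 256);
   assert(vma != 0);

   /* The descriptor state stores the offset relative to the pool memory. */
   uint32_t state_offset = vma - FAKE_POOL_HEAP_OFFSET;
   printf("heap address %llu -> state offset %u\n",
          (unsigned long long)vma, state_offset);
   return 0;
}
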
@@ -788,6 +1290,108 @@ void anv_DestroyPipelineLayout(
#define EMPTY 1
+static VkResult
+anv_descriptor_pool_heap_init(struct anv_device *device,
+ struct anv_descriptor_pool_heap *heap,
+ uint32_t size,
+ bool host_only,
+ bool samplers)
+{
+ if (size == 0)
+ return VK_SUCCESS;
+
+ if (host_only) {
+ heap->size = size;
+ heap->host_mem = vk_zalloc(&device->vk.alloc, size, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (heap->host_mem == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ } else {
+ const char *bo_name =
+ device->physical->indirect_descriptors ? "indirect descriptors" :
+ samplers ? "direct sampler" : "direct surfaces";
+
+ heap->size = align(size, 4096);
+
+ VkResult result = anv_device_alloc_bo(device,
+ bo_name, heap->size,
+ ANV_BO_ALLOC_CAPTURE |
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ (samplers ?
+ ANV_BO_ALLOC_SAMPLER_POOL :
+ ANV_BO_ALLOC_DESCRIPTOR_POOL),
+ 0 /* explicit_address */,
+ &heap->bo);
+ if (result != VK_SUCCESS)
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ }
+
+ util_vma_heap_init(&heap->heap, POOL_HEAP_OFFSET, heap->size);
+
+ return VK_SUCCESS;
+}
+
+static void
+anv_descriptor_pool_heap_fini(struct anv_device *device,
+ struct anv_descriptor_pool_heap *heap)
+{
+ if (heap->size == 0)
+ return;
+
+ util_vma_heap_finish(&heap->heap);
+
+ if (heap->bo)
+ anv_device_release_bo(device, heap->bo);
+
+ if (heap->host_mem)
+ vk_free(&device->vk.alloc, heap->host_mem);
+}
+
+static void
+anv_descriptor_pool_heap_reset(struct anv_device *device,
+ struct anv_descriptor_pool_heap *heap)
+{
+ if (heap->size == 0)
+ return;
+
+ util_vma_heap_finish(&heap->heap);
+ util_vma_heap_init(&heap->heap, POOL_HEAP_OFFSET, heap->size);
+}
+
+static VkResult
+anv_descriptor_pool_heap_alloc(struct anv_descriptor_pool *pool,
+ struct anv_descriptor_pool_heap *heap,
+ uint32_t size, uint32_t alignment,
+ struct anv_state *state)
+{
+ uint64_t pool_vma_offset =
+ util_vma_heap_alloc(&heap->heap, size, alignment);
+ if (pool_vma_offset == 0)
+ return vk_error(pool, VK_ERROR_FRAGMENTED_POOL);
+
+ assert(pool_vma_offset >= POOL_HEAP_OFFSET &&
+ pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX);
+
+ state->offset = pool_vma_offset - POOL_HEAP_OFFSET;
+ state->alloc_size = size;
+ if (heap->host_mem)
+ state->map = heap->host_mem + state->offset;
+ else
+ state->map = heap->bo->map + state->offset;
+
+ return VK_SUCCESS;
+}
+
+static void
+anv_descriptor_pool_heap_free(struct anv_descriptor_pool_heap *heap,
+ struct anv_state state)
+{
+ util_vma_heap_free(&heap->heap,
+ (uint64_t)state.offset + POOL_HEAP_OFFSET,
+ state.alloc_size);
+}
+
VkResult anv_CreateDescriptorPool(
VkDevice _device,
const VkDescriptorPoolCreateInfo* pCreateInfo,
@@ -797,41 +1401,71 @@ VkResult anv_CreateDescriptorPool(
ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_descriptor_pool *pool;
- const VkDescriptorPoolInlineUniformBlockCreateInfoEXT *inline_info =
+ const VkDescriptorPoolInlineUniformBlockCreateInfo *inline_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO);
+ const VkMutableDescriptorTypeCreateInfoEXT *mutable_info =
vk_find_struct_const(pCreateInfo->pNext,
- DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO_EXT);
+ MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT);
uint32_t descriptor_count = 0;
uint32_t buffer_view_count = 0;
- uint32_t descriptor_bo_size = 0;
+ uint32_t descriptor_bo_surface_size = 0;
+ uint32_t descriptor_bo_sampler_size = 0;
+
+ const enum anv_descriptor_set_layout_type layout_type =
+ device->physical->indirect_descriptors ?
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT :
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT;
+
for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++) {
enum anv_descriptor_data desc_data =
- anv_descriptor_data_for_type(device->physical,
+ pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_mutable_type(device->physical, layout_type,
+ pCreateInfo->flags,
+ mutable_info, i) :
+ anv_descriptor_data_for_type(device->physical, layout_type,
+ pCreateInfo->flags,
pCreateInfo->pPoolSizes[i].type);
if (desc_data & ANV_DESCRIPTOR_BUFFER_VIEW)
buffer_view_count += pCreateInfo->pPoolSizes[i].descriptorCount;
- unsigned desc_data_size = anv_descriptor_data_size(desc_data) *
- pCreateInfo->pPoolSizes[i].descriptorCount;
+ uint16_t desc_surface_size, desc_sampler_size;
+ if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) {
+ anv_descriptor_size_for_mutable_type(device->physical, layout_type,
+ pCreateInfo->flags, mutable_info, i,
+ &desc_surface_size, &desc_sampler_size);
+ } else {
+ anv_descriptor_data_size(desc_data, layout_type,
+ &desc_surface_size, &desc_sampler_size);
+ }
+
+ uint32_t desc_data_surface_size =
+ desc_surface_size * pCreateInfo->pPoolSizes[i].descriptorCount;
+ uint32_t desc_data_sampler_size =
+ desc_sampler_size * pCreateInfo->pPoolSizes[i].descriptorCount;
/* Combined image sampler descriptors can take up to 3 slots if they
* hold a YCbCr image.
*/
if (pCreateInfo->pPoolSizes[i].type ==
- VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
- desc_data_size *= 3;
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
+ desc_data_surface_size *= 3;
+ desc_data_sampler_size *= 3;
+ }
if (pCreateInfo->pPoolSizes[i].type ==
- VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+ VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
/* Inline uniform blocks are specified to use the descriptor array
* size as the size in bytes of the block.
*/
assert(inline_info);
- desc_data_size += pCreateInfo->pPoolSizes[i].descriptorCount;
+ desc_data_surface_size += pCreateInfo->pPoolSizes[i].descriptorCount;
}
- descriptor_bo_size += desc_data_size;
+ descriptor_bo_surface_size += desc_data_surface_size;
+ descriptor_bo_sampler_size += desc_data_sampler_size;
descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
}
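
Worked through with invented numbers, the sizing loop above behaves as follows for a pool holding 8 combined image/samplers plus one 256-byte inline uniform block; the factor of 3 is the worst-case 3-plane YCbCr multiplier and the inline block contributes its byte size directly (the per-set and per-inline alignment slack is added in the hunk that follows). Sizes are placeholders, not driver constants.

/* Placeholder sizes; only the shape of the computation matches the loop above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
   const uint32_t surface_size = 64;  /* made-up per-descriptor surface bytes */
   const uint32_t sampler_size = 32;  /* made-up per-descriptor sampler bytes */

   uint32_t surface_total = 0, sampler_total = 0;

   /* VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, descriptorCount = 8:
    * worst case is a 3-plane YCbCr image, hence the * 3.
    */
   surface_total += 8 * surface_size * 3;
   sampler_total += 8 * sampler_size * 3;

   /* VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK, descriptorCount = 256:
    * the count is the block size in bytes.
    */
   surface_total += 256;

   printf("surface heap >= %u bytes, sampler heap >= %u bytes\n",
          surface_total, sampler_total);
   return 0;
}
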
@@ -843,53 +1477,70 @@ VkResult anv_CreateDescriptorPool(
* extra space that we can chop it into maxSets pieces and align each one
* of them to 32B.
*/
- descriptor_bo_size += ANV_UBO_ALIGNMENT * pCreateInfo->maxSets;
+ descriptor_bo_surface_size += ANV_UBO_ALIGNMENT * pCreateInfo->maxSets;
/* We align inline uniform blocks to ANV_UBO_ALIGNMENT */
if (inline_info) {
- descriptor_bo_size +=
+ descriptor_bo_surface_size +=
ANV_UBO_ALIGNMENT * inline_info->maxInlineUniformBlockBindings;
}
- descriptor_bo_size = ALIGN(descriptor_bo_size, 4096);
- const size_t pool_size =
+ const bool host_only =
+ pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT;
+
+ /* For host_only pools, allocate some memory to hold the written surface
+ * states of the internal anv_buffer_view. With normal pools, the memory
+ * holding surface state is allocated from the device surface_state_pool.
+ */
+ const size_t host_mem_size =
pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) +
descriptor_count * sizeof(struct anv_descriptor) +
- buffer_view_count * sizeof(struct anv_buffer_view);
- const size_t total_size = sizeof(*pool) + pool_size;
+ buffer_view_count * sizeof(struct anv_buffer_view) +
+ (host_only ? buffer_view_count * ANV_SURFACE_STATE_SIZE : 0);
- pool = vk_object_alloc(&device->vk, pAllocator, total_size,
- VK_OBJECT_TYPE_DESCRIPTOR_POOL);
+ pool = vk_object_zalloc(&device->vk, pAllocator,
+ sizeof(*pool) + host_mem_size,
+ VK_OBJECT_TYPE_DESCRIPTOR_POOL);
if (!pool)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- pool->size = pool_size;
- pool->next = 0;
- pool->free_list = EMPTY;
+ pool->host_mem_size = host_mem_size;
+ util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, host_mem_size);
- if (descriptor_bo_size > 0) {
- VkResult result = anv_device_alloc_bo(device,
- "descriptors",
- descriptor_bo_size,
- ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED,
- 0 /* explicit_address */,
- &pool->bo);
- if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, pool);
- return result;
- }
+ pool->host_only = host_only;
- util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, descriptor_bo_size);
- } else {
- pool->bo = NULL;
+ VkResult result = anv_descriptor_pool_heap_init(device,
+ &pool->surfaces,
+ descriptor_bo_surface_size,
+ pool->host_only,
+ false /* samplers */);
+ if (result != VK_SUCCESS) {
+ vk_object_free(&device->vk, pAllocator, pool);
+ return result;
}
+ result = anv_descriptor_pool_heap_init(device,
+ &pool->samplers,
+ descriptor_bo_sampler_size,
+ pool->host_only,
+ true /* samplers */);
+ if (result != VK_SUCCESS) {
+ anv_descriptor_pool_heap_fini(device, &pool->surfaces);
+ vk_object_free(&device->vk, pAllocator, pool);
+ return result;
+ }
+
+ /* All the surface states allocated by the descriptor pool are internal. We
+ * have to allocate them because we do not have surface states for
+ * VkBuffers.
+ */
anv_state_stream_init(&pool->surface_state_stream,
- &device->surface_state_pool, 4096);
+ &device->internal_surface_state_pool, 4096);
pool->surface_state_free_list = NULL;
list_inithead(&pool->desc_sets);
+ ANV_RMV(descriptor_pool_create, device, pCreateInfo, pool, false);
+
*pDescriptorPool = anv_descriptor_pool_to_handle(pool);
return VK_SUCCESS;
@@ -906,17 +1557,20 @@ void anv_DestroyDescriptorPool(
if (!pool)
return;
+ ANV_RMV(resource_destroy, device, pool);
+
list_for_each_entry_safe(struct anv_descriptor_set, set,
&pool->desc_sets, pool_link) {
anv_descriptor_set_layout_unref(device, set->layout);
}
- if (pool->bo) {
- util_vma_heap_finish(&pool->bo_heap);
- anv_device_release_bo(device, pool->bo);
- }
+ util_vma_heap_finish(&pool->host_heap);
+
anv_state_stream_finish(&pool->surface_state_stream);
+ anv_descriptor_pool_heap_fini(device, &pool->surfaces);
+ anv_descriptor_pool_heap_fini(device, &pool->samplers);
+
vk_object_free(&device->vk, pAllocator, pool);
}
@@ -934,73 +1588,51 @@ VkResult anv_ResetDescriptorPool(
}
list_inithead(&pool->desc_sets);
- pool->next = 0;
- pool->free_list = EMPTY;
+ util_vma_heap_finish(&pool->host_heap);
+ util_vma_heap_init(&pool->host_heap, POOL_HEAP_OFFSET, pool->host_mem_size);
- if (pool->bo) {
- util_vma_heap_finish(&pool->bo_heap);
- util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo->size);
- }
+ anv_descriptor_pool_heap_reset(device, &pool->surfaces);
+ anv_descriptor_pool_heap_reset(device, &pool->samplers);
anv_state_stream_finish(&pool->surface_state_stream);
anv_state_stream_init(&pool->surface_state_stream,
- &device->surface_state_pool, 4096);
+ &device->internal_surface_state_pool, 4096);
pool->surface_state_free_list = NULL;
return VK_SUCCESS;
}
-struct pool_free_list_entry {
- uint32_t next;
- uint32_t size;
-};
-
static VkResult
anv_descriptor_pool_alloc_set(struct anv_descriptor_pool *pool,
uint32_t size,
struct anv_descriptor_set **set)
{
- if (size <= pool->size - pool->next) {
- *set = (struct anv_descriptor_set *) (pool->data + pool->next);
- (*set)->size = size;
- pool->next += size;
- return VK_SUCCESS;
- } else {
- struct pool_free_list_entry *entry;
- uint32_t *link = &pool->free_list;
- for (uint32_t f = pool->free_list; f != EMPTY; f = entry->next) {
- entry = (struct pool_free_list_entry *) (pool->data + f);
- if (size <= entry->size) {
- *link = entry->next;
- *set = (struct anv_descriptor_set *) entry;
- (*set)->size = entry->size;
- return VK_SUCCESS;
- }
- link = &entry->next;
- }
+ uint64_t vma_offset = util_vma_heap_alloc(&pool->host_heap, size, 1);
- if (pool->free_list != EMPTY) {
- return vk_error(VK_ERROR_FRAGMENTED_POOL);
+ if (vma_offset == 0) {
+ if (size <= pool->host_heap.free_size) {
+ return VK_ERROR_FRAGMENTED_POOL;
} else {
- return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY);
+ return VK_ERROR_OUT_OF_POOL_MEMORY;
}
}
+
+ assert(vma_offset >= POOL_HEAP_OFFSET);
+ uint64_t host_mem_offset = vma_offset - POOL_HEAP_OFFSET;
+
+ *set = (struct anv_descriptor_set *) (pool->host_mem + host_mem_offset);
+ (*set)->size = size;
+
+ return VK_SUCCESS;
}
static void
anv_descriptor_pool_free_set(struct anv_descriptor_pool *pool,
struct anv_descriptor_set *set)
{
- /* Put the descriptor set allocation back on the free list. */
- const uint32_t index = (char *) set - pool->data;
- if (index + set->size == pool->next) {
- pool->next = index;
- } else {
- struct pool_free_list_entry *entry = (struct pool_free_list_entry *) set;
- entry->next = pool->free_list;
- entry->size = set->size;
- pool->free_list = (char *) entry - pool->data;
- }
+ util_vma_heap_free(&pool->host_heap,
+ ((char *) set - pool->host_mem) + POOL_HEAP_OFFSET,
+ set->size);
}
struct surface_state_free_list_entry {
@@ -1011,16 +1643,21 @@ struct surface_state_free_list_entry {
static struct anv_state
anv_descriptor_pool_alloc_state(struct anv_descriptor_pool *pool)
{
+ assert(!pool->host_only);
+
struct surface_state_free_list_entry *entry =
pool->surface_state_free_list;
if (entry) {
struct anv_state state = entry->state;
pool->surface_state_free_list = entry->next;
- assert(state.alloc_size == 64);
+ assert(state.alloc_size == ANV_SURFACE_STATE_SIZE);
return state;
} else {
- return anv_state_stream_alloc(&pool->surface_state_stream, 64, 64);
+ struct anv_state state =
+ anv_state_stream_alloc(&pool->surface_state_stream,
+ ANV_SURFACE_STATE_SIZE, 64);
+ return state;
}
}
@@ -1028,6 +1665,7 @@ static void
anv_descriptor_pool_free_state(struct anv_descriptor_pool *pool,
struct anv_state state)
{
+ assert(state.alloc_size);
/* Put the buffer view surface state back on the free list. */
struct surface_state_free_list_entry *entry = state.map;
entry->next = pool->surface_state_free_list;
@@ -1035,9 +1673,9 @@ anv_descriptor_pool_free_state(struct anv_descriptor_pool *pool,
pool->surface_state_free_list = entry;
}
-size_t
+static size_t
anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout,
- uint32_t var_desc_count)
+ bool host_only, uint32_t var_desc_count)
{
const uint32_t descriptor_count =
set_layout_descriptor_count(layout, var_desc_count);
@@ -1046,10 +1684,11 @@ anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout,
return sizeof(struct anv_descriptor_set) +
descriptor_count * sizeof(struct anv_descriptor) +
- buffer_view_count * sizeof(struct anv_buffer_view);
+ buffer_view_count * sizeof(struct anv_buffer_view) +
+ (host_only ? buffer_view_count * ANV_SURFACE_STATE_SIZE : 0);
}
-VkResult
+static VkResult
anv_descriptor_set_create(struct anv_device *device,
struct anv_descriptor_pool *pool,
struct anv_descriptor_set_layout *layout,
@@ -1057,46 +1696,78 @@ anv_descriptor_set_create(struct anv_device *device,
struct anv_descriptor_set **out_set)
{
struct anv_descriptor_set *set;
- const size_t size = anv_descriptor_set_layout_size(layout, var_desc_count);
+ const size_t size = anv_descriptor_set_layout_size(layout,
+ pool->host_only,
+ var_desc_count);
VkResult result = anv_descriptor_pool_alloc_set(pool, size, &set);
if (result != VK_SUCCESS)
return result;
- uint32_t descriptor_buffer_size =
- anv_descriptor_set_layout_descriptor_buffer_size(layout, var_desc_count);
- if (descriptor_buffer_size) {
- uint64_t pool_vma_offset =
- util_vma_heap_alloc(&pool->bo_heap, descriptor_buffer_size,
- ANV_UBO_ALIGNMENT);
- if (pool_vma_offset == 0) {
+ uint32_t descriptor_buffer_surface_size, descriptor_buffer_sampler_size;
+ anv_descriptor_set_layout_descriptor_buffer_size(layout, var_desc_count,
+ &descriptor_buffer_surface_size,
+ &descriptor_buffer_sampler_size);
+
+ set->desc_surface_state = ANV_STATE_NULL;
+ set->is_push = false;
+
+ if (descriptor_buffer_surface_size) {
+ result = anv_descriptor_pool_heap_alloc(pool, &pool->surfaces,
+ descriptor_buffer_surface_size,
+ ANV_UBO_ALIGNMENT,
+ &set->desc_surface_mem);
+ if (result != VK_SUCCESS) {
anv_descriptor_pool_free_set(pool, set);
- return vk_error(VK_ERROR_FRAGMENTED_POOL);
+ return result;
}
- assert(pool_vma_offset >= POOL_HEAP_OFFSET &&
- pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX);
- set->desc_mem.offset = pool_vma_offset - POOL_HEAP_OFFSET;
- set->desc_mem.alloc_size = descriptor_buffer_size;
- set->desc_mem.map = pool->bo->map + set->desc_mem.offset;
-
- set->desc_addr = (struct anv_address) {
- .bo = pool->bo,
- .offset = set->desc_mem.offset,
+
+ set->desc_surface_addr = (struct anv_address) {
+ .bo = pool->surfaces.bo,
+ .offset = set->desc_surface_mem.offset,
};
+ set->desc_offset = anv_address_physical(set->desc_surface_addr) -
+ device->physical->va.internal_surface_state_pool.addr;
enum isl_format format =
anv_isl_format_for_descriptor_type(device,
VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
- set->desc_surface_state = anv_descriptor_pool_alloc_state(pool);
- anv_fill_buffer_surface_state(device, set->desc_surface_state, format,
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
- set->desc_addr,
- descriptor_buffer_size, 1);
+ if (!pool->host_only) {
+ set->desc_surface_state = anv_descriptor_pool_alloc_state(pool);
+ if (set->desc_surface_state.map == NULL) {
+ anv_descriptor_pool_free_set(pool, set);
+ return vk_error(pool, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ }
+
+ anv_fill_buffer_surface_state(device, set->desc_surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
+ set->desc_surface_addr,
+ descriptor_buffer_surface_size, 1);
+ }
} else {
- set->desc_mem = ANV_STATE_NULL;
- set->desc_addr = (struct anv_address) { .bo = NULL, .offset = 0 };
- set->desc_surface_state = ANV_STATE_NULL;
+ set->desc_surface_mem = ANV_STATE_NULL;
+ set->desc_surface_addr = ANV_NULL_ADDRESS;
+ }
+
+ if (descriptor_buffer_sampler_size) {
+ result = anv_descriptor_pool_heap_alloc(pool, &pool->samplers,
+ descriptor_buffer_sampler_size,
+ ANV_SAMPLER_STATE_SIZE,
+ &set->desc_sampler_mem);
+ if (result != VK_SUCCESS) {
+ anv_descriptor_pool_free_set(pool, set);
+ return result;
+ }
+
+ set->desc_sampler_addr = (struct anv_address) {
+ .bo = pool->samplers.bo,
+ .offset = set->desc_sampler_mem.offset,
+ };
+ } else {
+ set->desc_sampler_mem = ANV_STATE_NULL;
+ set->desc_sampler_addr = ANV_NULL_ADDRESS;
}
vk_object_base_init(&device->vk, &set->base,
@@ -1120,7 +1791,6 @@ anv_descriptor_set_create(struct anv_device *device,
sizeof(struct anv_descriptor) * set->descriptor_count);
/* Go through and fill out immutable samplers if we have any */
- struct anv_descriptor *desc = set->descriptors;
for (uint32_t b = 0; b < layout->binding_count; b++) {
if (layout->binding[b].immutable_samplers) {
for (uint32_t i = 0; i < layout->binding[b].array_size; i++) {
@@ -1139,13 +1809,30 @@ anv_descriptor_set_create(struct anv_device *device,
b, i);
}
}
- desc += layout->binding[b].array_size;
}
- /* Allocate surface state for the buffer views. */
- for (uint32_t b = 0; b < set->buffer_view_count; b++) {
- set->buffer_views[b].surface_state =
- anv_descriptor_pool_alloc_state(pool);
+ /* Allocate surface states for real descriptor sets if we're using indirect
+ * descriptors. For host-only sets, we just store the surface state data in
+ * malloc memory.
+ */
+ if (device->physical->indirect_descriptors) {
+ if (!pool->host_only) {
+ for (uint32_t b = 0; b < set->buffer_view_count; b++) {
+ set->buffer_views[b].general.state =
+ anv_descriptor_pool_alloc_state(pool);
+ }
+ } else {
+ void *host_surface_states =
+ set->buffer_views + set->buffer_view_count;
+ memset(host_surface_states, 0,
+ set->buffer_view_count * ANV_SURFACE_STATE_SIZE);
+ for (uint32_t b = 0; b < set->buffer_view_count; b++) {
+ set->buffer_views[b].general.state = (struct anv_state) {
+ .alloc_size = ANV_SURFACE_STATE_SIZE,
+ .map = host_surface_states + b * ANV_SURFACE_STATE_SIZE,
+ };
+ }
+ }
}
list_addtail(&set->pool_link, &pool->desc_sets);
@@ -1155,22 +1842,32 @@ anv_descriptor_set_create(struct anv_device *device,
return VK_SUCCESS;
}
-void
+static void
anv_descriptor_set_destroy(struct anv_device *device,
struct anv_descriptor_pool *pool,
struct anv_descriptor_set *set)
{
anv_descriptor_set_layout_unref(device, set->layout);
- if (set->desc_mem.alloc_size) {
- util_vma_heap_free(&pool->bo_heap,
- (uint64_t)set->desc_mem.offset + POOL_HEAP_OFFSET,
- set->desc_mem.alloc_size);
- anv_descriptor_pool_free_state(pool, set->desc_surface_state);
+ if (set->desc_surface_mem.alloc_size) {
+ anv_descriptor_pool_heap_free(&pool->surfaces, set->desc_surface_mem);
+ if (set->desc_surface_state.alloc_size)
+ anv_descriptor_pool_free_state(pool, set->desc_surface_state);
}
- for (uint32_t b = 0; b < set->buffer_view_count; b++)
- anv_descriptor_pool_free_state(pool, set->buffer_views[b].surface_state);
+ if (set->desc_sampler_mem.alloc_size)
+ anv_descriptor_pool_heap_free(&pool->samplers, set->desc_sampler_mem);
+
+ if (device->physical->indirect_descriptors) {
+ if (!pool->host_only) {
+ for (uint32_t b = 0; b < set->buffer_view_count; b++) {
+ if (set->buffer_views[b].general.state.alloc_size) {
+ anv_descriptor_pool_free_state(
+ pool, set->buffer_views[b].general.state);
+ }
+ }
+ }
+ }
list_del(&set->pool_link);
@@ -1187,7 +1884,7 @@ VkResult anv_AllocateDescriptorSets(
ANV_FROM_HANDLE(anv_descriptor_pool, pool, pAllocateInfo->descriptorPool);
VkResult result = VK_SUCCESS;
- struct anv_descriptor_set *set;
+ struct anv_descriptor_set *set = NULL;
uint32_t i;
const VkDescriptorSetVariableDescriptorCountAllocateInfo *vdcai =
@@ -1212,9 +1909,20 @@ VkResult anv_AllocateDescriptorSets(
pDescriptorSets[i] = anv_descriptor_set_to_handle(set);
}
- if (result != VK_SUCCESS)
+ if (result != VK_SUCCESS) {
anv_FreeDescriptorSets(_device, pAllocateInfo->descriptorPool,
i, pDescriptorSets);
+ /* The Vulkan 1.3.228 spec, section 14.2.3. Allocation of Descriptor Sets:
+ *
+ * "If the creation of any of those descriptor sets fails, then the
+ * implementation must destroy all successfully created descriptor
+ * set objects from this command, set all entries of the
+ * pDescriptorSets array to VK_NULL_HANDLE and return the error."
+ */
+ for (i = 0; i < pAllocateInfo->descriptorSetCount; i++)
+ pDescriptorSets[i] = VK_NULL_HANDLE;
+
+ }
return result;
}
@@ -1240,34 +1948,194 @@ VkResult anv_FreeDescriptorSets(
return VK_SUCCESS;
}
-static void
-anv_descriptor_set_write_image_param(uint32_t *param_desc_map,
- const struct brw_image_param *param)
+bool
+anv_push_descriptor_set_init(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_push_descriptor_set *push_set,
+ struct anv_descriptor_set_layout *layout)
{
-#define WRITE_PARAM_FIELD(field, FIELD) \
- for (unsigned i = 0; i < ARRAY_SIZE(param->field); i++) \
- param_desc_map[BRW_IMAGE_PARAM_##FIELD##_OFFSET + i] = param->field[i]
+ struct anv_descriptor_set *set = &push_set->set;
+ /* Only copy the old descriptor data if needed:
+ * - not if there was no previous layout
+ * - not if the layout is different (the descriptor set data becomes
+ * undefined)
+ * - not if there is only one descriptor, since the entire data will
+ * be replaced
+ *
+ * TODO: we could optimize further, e.g. keep a copy of the old data on
+ * the host, or copy only the bits that were not newly written, ...
+ */
+ const bool copy_old_descriptors = set->layout != NULL &&
+ set->layout == layout &&
+ layout->descriptor_count > 1;
- WRITE_PARAM_FIELD(offset, OFFSET);
- WRITE_PARAM_FIELD(size, SIZE);
- WRITE_PARAM_FIELD(stride, STRIDE);
- WRITE_PARAM_FIELD(tiling, TILING);
- WRITE_PARAM_FIELD(swizzling, SWIZZLING);
- WRITE_PARAM_FIELD(size, SIZE);
+ if (set->layout != layout) {
+ if (set->layout) {
+ anv_descriptor_set_layout_unref(cmd_buffer->device, set->layout);
+ } else {
+ /* one-time initialization */
+ vk_object_base_init(&cmd_buffer->device->vk, &set->base,
+ VK_OBJECT_TYPE_DESCRIPTOR_SET);
+ set->is_push = true;
+ set->buffer_views = push_set->buffer_views;
+ }
-#undef WRITE_PARAM_FIELD
+ anv_descriptor_set_layout_ref(layout);
+ set->layout = layout;
+ set->generate_surface_states = 0;
+ }
+
+ assert(set->is_push && set->buffer_views);
+ set->size = anv_descriptor_set_layout_size(layout, false /* host_only */, 0);
+ set->buffer_view_count = layout->buffer_view_count;
+ set->descriptor_count = layout->descriptor_count;
+
+ if (layout->descriptor_buffer_surface_size &&
+ (push_set->set_used_on_gpu ||
+ set->desc_surface_mem.alloc_size < layout->descriptor_buffer_surface_size)) {
+ struct anv_physical_device *pdevice = cmd_buffer->device->physical;
+ struct anv_state_stream *push_stream;
+ uint64_t push_base_address;
+
+ if (layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT) {
+ push_stream = pdevice->uses_ex_bso ?
+ &cmd_buffer->push_descriptor_buffer_stream :
+ &cmd_buffer->surface_state_stream;
+ push_base_address = pdevice->uses_ex_bso ?
+ pdevice->va.push_descriptor_buffer_pool.addr :
+ pdevice->va.internal_surface_state_pool.addr;
+ } else {
+ push_stream = pdevice->indirect_descriptors ?
+ &cmd_buffer->indirect_push_descriptor_stream :
+ &cmd_buffer->surface_state_stream;
+ push_base_address = pdevice->indirect_descriptors ?
+ pdevice->va.indirect_push_descriptor_pool.addr :
+ pdevice->va.internal_surface_state_pool.addr;
+ }
+
+ uint32_t surface_size, sampler_size;
+ anv_descriptor_set_layout_descriptor_buffer_size(layout, 0,
+ &surface_size,
+ &sampler_size);
+
+ /* The previous buffer is either actively used by some GPU command (so
+ * we can't modify it) or is too small. Allocate a new one.
+ */
+ struct anv_state desc_surface_mem =
+ anv_state_stream_alloc(push_stream, surface_size, ANV_UBO_ALIGNMENT);
+ if (desc_surface_mem.map == NULL)
+ return false;
+
+ if (copy_old_descriptors) {
+ memcpy(desc_surface_mem.map, set->desc_surface_mem.map,
+ MIN2(desc_surface_mem.alloc_size,
+ set->desc_surface_mem.alloc_size));
+ }
+ set->desc_surface_mem = desc_surface_mem;
+
+ set->desc_surface_addr = anv_state_pool_state_address(
+ push_stream->state_pool,
+ set->desc_surface_mem);
+ set->desc_offset = anv_address_physical(set->desc_surface_addr) -
+ push_base_address;
+ }
+
+ if (layout->descriptor_buffer_sampler_size &&
+ (push_set->set_used_on_gpu ||
+ set->desc_sampler_mem.alloc_size < layout->descriptor_buffer_sampler_size)) {
+ struct anv_physical_device *pdevice = cmd_buffer->device->physical;
+ assert(!pdevice->indirect_descriptors);
+ struct anv_state_stream *push_stream = &cmd_buffer->dynamic_state_stream;
+
+ uint32_t surface_size, sampler_size;
+ anv_descriptor_set_layout_descriptor_buffer_size(layout, 0,
+ &surface_size,
+ &sampler_size);
+
+ /* The previous buffer is either actively used by some GPU command (so
+ * we can't modify it) or is too small. Allocate a new one.
+ */
+ struct anv_state desc_sampler_mem =
+ anv_state_stream_alloc(push_stream, sampler_size, ANV_SAMPLER_STATE_SIZE);
+ if (desc_sampler_mem.map == NULL)
+ return false;
+
+ if (copy_old_descriptors) {
+ memcpy(desc_sampler_mem.map, set->desc_sampler_mem.map,
+ MIN2(desc_sampler_mem.alloc_size,
+ set->desc_sampler_mem.alloc_size));
+ }
+ set->desc_sampler_mem = desc_sampler_mem;
+
+ set->desc_sampler_addr = anv_state_pool_state_address(
+ push_stream->state_pool,
+ set->desc_sampler_mem);
+ }
+
+ if (push_set->set_used_on_gpu) {
+ set->desc_surface_state = ANV_STATE_NULL;
+ push_set->set_used_on_gpu = false;
+ }
+
+ return true;
+}
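The copy_old_descriptors logic above covers partial pushes: vkCmdPushDescriptorSetKHR may rewrite only some bindings, and data pushed earlier has to survive when a new backing allocation is taken. A minimal app-side sketch of that case (cmd, pipeline_layout and buffer_info are hypothetical handles, not taken from this change):

    /* A first push filled bindings 0 and 1; this second push rewrites only
     * binding 1, so the driver must carry binding 0 over into any freshly
     * allocated desc_surface_mem (the memcpy above). */
    VkWriteDescriptorSet write = {
       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
       .dstBinding = 1,
       .descriptorCount = 1,
       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
       .pBufferInfo = &buffer_info,
    };
    vkCmdPushDescriptorSetKHR(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
                              pipeline_layout, /*set*/ 0, 1, &write);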
+
+void
+anv_push_descriptor_set_finish(struct anv_push_descriptor_set *push_set)
+{
+ struct anv_descriptor_set *set = &push_set->set;
+ if (set->layout) {
+ struct anv_device *device =
+ container_of(set->base.device, struct anv_device, vk);
+ anv_descriptor_set_layout_unref(device, set->layout);
+ }
}
static uint32_t
-anv_surface_state_to_handle(struct anv_state state)
+anv_surface_state_to_handle(struct anv_physical_device *device,
+ struct anv_state state)
{
/* Bits 31:12 of the bindless surface offset in the extended message
* descriptor is bits 25:6 of the byte-based address.
*/
assert(state.offset >= 0);
uint32_t offset = state.offset;
- assert((offset & 0x3f) == 0 && offset < (1 << 26));
- return offset << 6;
+ if (device->uses_ex_bso) {
+ assert((offset & 0x3f) == 0);
+ return offset;
+ } else {
+ assert((offset & 0x3f) == 0 && offset < (1 << 26));
+ return offset << 6;
+ }
+}
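As a sanity check on the bit packing above, here is the same computation as a standalone sketch (illustration only; it mirrors the two branches and their asserts, nothing here is new driver behavior):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Convert a 64B-aligned surface state offset into the handle written
     * into descriptors, following anv_surface_state_to_handle() above. */
    static uint32_t
    bindless_handle_for_offset(uint32_t offset, bool uses_ex_bso)
    {
       assert((offset & 0x3f) == 0);   /* surface states are 64B aligned */
       if (uses_ex_bso)
          return offset;               /* byte offset is used directly */
       assert(offset < (1u << 26));    /* 26-bit byte offset limit */
       return offset << 6;             /* address bits 25:6 -> handle bits 31:12 */
    }

    /* Example: a state at byte offset 0x1040 becomes 0x41000 without
     * ex_bso and stays 0x1040 with it. */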
+
+static const void *
+anv_image_view_surface_data_for_plane_layout(struct anv_image_view *image_view,
+ VkDescriptorType desc_type,
+ unsigned plane,
+ VkImageLayout layout)
+{
+ if (desc_type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER ||
+ desc_type == VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE ||
+ desc_type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) {
+ return layout == VK_IMAGE_LAYOUT_GENERAL ?
+ &image_view->planes[plane].general_sampler.state_data :
+ &image_view->planes[plane].optimal_sampler.state_data;
+ }
+
+ if (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
+ return &image_view->planes[plane].storage.state_data;
+
+ unreachable("Invalid descriptor type");
+}
+
+static const uint32_t *
+anv_sampler_state_for_descriptor_set(const struct anv_sampler *sampler,
+ const struct anv_descriptor_set *set,
+ uint32_t plane)
+{
+ if (set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT)
+ return sampler->db_state[plane];
+ return sampler->state[plane];
}
void
@@ -1289,7 +2157,8 @@ anv_descriptor_set_write_image_view(struct anv_device *device,
* set initialization to set the bindless samplers.
*/
assert(type == bind_layout->type ||
- type == VK_DESCRIPTOR_TYPE_SAMPLER);
+ type == VK_DESCRIPTOR_TYPE_SAMPLER ||
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT);
switch (type) {
case VK_DESCRIPTOR_TYPE_SAMPLER:
@@ -1322,21 +2191,27 @@ anv_descriptor_set_write_image_view(struct anv_device *device,
.sampler = sampler,
};
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
- element * anv_descriptor_size(bind_layout);
- memset(desc_map, 0, anv_descriptor_size(bind_layout));
+ void *desc_surface_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride;
- if (bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
+ enum anv_descriptor_data data =
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_type(device->physical, set->layout->type,
+ set->layout->flags, type) :
+ bind_layout->data;
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE) {
struct anv_sampled_image_descriptor desc_data[3];
memset(desc_data, 0, sizeof(desc_data));
if (image_view) {
for (unsigned p = 0; p < image_view->n_planes; p++) {
- struct anv_surface_state sstate =
- (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
- image_view->planes[p].general_sampler_surface_state :
- image_view->planes[p].optimal_sampler_surface_state;
- desc_data[p].image = anv_surface_state_to_handle(sstate.state);
+ const struct anv_surface_state *sstate =
+ anv_image_view_texture_surface_state(image_view, p,
+ desc->layout);
+ desc_data[p].image =
+ anv_surface_state_to_handle(device->physical, sstate->state);
}
}
@@ -1349,55 +2224,101 @@ anv_descriptor_set_write_image_view(struct anv_device *device,
* can be no more than the size of our array of handles.
*/
assert(bind_layout->max_plane_count <= ARRAY_SIZE(desc_data));
- memcpy(desc_map, desc_data,
+ memcpy(desc_surface_map, desc_data,
MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0]));
}
- if (image_view == NULL)
- return;
+ if (data & ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE) {
+ if (image_view) {
+ assert(image_view->n_planes == 1);
+ struct anv_storage_image_descriptor desc_data = {
+ .vanilla = anv_surface_state_to_handle(
+ device->physical,
+ anv_image_view_storage_surface_state(image_view)->state),
+ .image_depth = image_view->vk.storage.z_slice_count,
+ };
+ memcpy(desc_surface_map, &desc_data, sizeof(desc_data));
+ } else {
+ memset(desc_surface_map, 0, bind_layout->descriptor_surface_stride);
+ }
+ }
- if (bind_layout->data & ANV_DESCRIPTOR_STORAGE_IMAGE) {
- assert(!(bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM));
- assert(image_view->n_planes == 1);
- struct anv_storage_image_descriptor desc_data = {
- .read_write = anv_surface_state_to_handle(
- image_view->planes[0].storage_surface_state.state),
- .write_only = anv_surface_state_to_handle(
- image_view->planes[0].writeonly_storage_surface_state.state),
- };
- memcpy(desc_map, &desc_data, sizeof(desc_data));
+ if (data & ANV_DESCRIPTOR_SAMPLER) {
+ void *sampler_map =
+ set->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ?
+ (set->desc_sampler_mem.map +
+ bind_layout->descriptor_sampler_offset +
+ element * bind_layout->descriptor_sampler_stride) : desc_surface_map;
+ if (sampler) {
+ for (unsigned p = 0; p < sampler->n_planes; p++) {
+ memcpy(sampler_map + p * ANV_SAMPLER_STATE_SIZE,
+ anv_sampler_state_for_descriptor_set(sampler, set, p),
+ ANV_SAMPLER_STATE_SIZE);
+ }
+ } else {
+ memset(sampler_map, 0, bind_layout->descriptor_sampler_stride);
+ }
}
- if (bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM) {
- /* Storage images can only ever have one plane */
- assert(image_view->n_planes == 1);
- const struct brw_image_param *image_param =
- &image_view->planes[0].storage_image_param;
+ if (data & ANV_DESCRIPTOR_SURFACE) {
+ unsigned max_plane_count = image_view ? image_view->n_planes : 1;
+
+ for (unsigned p = 0; p < max_plane_count; p++) {
+ void *plane_map = desc_surface_map + p * ANV_SURFACE_STATE_SIZE;
- anv_descriptor_set_write_image_param(desc_map, image_param);
+ if (image_view) {
+ memcpy(plane_map,
+ anv_image_view_surface_data_for_plane_layout(image_view, type,
+ p, desc->layout),
+ ANV_SURFACE_STATE_SIZE);
+ } else {
+ memcpy(plane_map, &device->host_null_surface_state, ANV_SURFACE_STATE_SIZE);
+ }
+ }
}
- if (bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) {
- assert(!(bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE));
- assert(image_view);
- struct anv_texture_swizzle_descriptor desc_data[3];
- memset(desc_data, 0, sizeof(desc_data));
+ if (data & ANV_DESCRIPTOR_SURFACE_SAMPLER) {
+ unsigned max_plane_count =
+ MAX2(image_view ? image_view->n_planes : 1,
+ sampler ? sampler->n_planes : 1);
- for (unsigned p = 0; p < image_view->n_planes; p++) {
- desc_data[p] = (struct anv_texture_swizzle_descriptor) {
- .swizzle = {
- (uint8_t)image_view->planes[p].isl.swizzle.r,
- (uint8_t)image_view->planes[p].isl.swizzle.g,
- (uint8_t)image_view->planes[p].isl.swizzle.b,
- (uint8_t)image_view->planes[p].isl.swizzle.a,
- },
- };
+ for (unsigned p = 0; p < max_plane_count; p++) {
+ void *plane_map = desc_surface_map + p * 2 * ANV_SURFACE_STATE_SIZE;
+
+ if (image_view) {
+ memcpy(plane_map,
+ anv_image_view_surface_data_for_plane_layout(image_view, type,
+ p, desc->layout),
+ ANV_SURFACE_STATE_SIZE);
+ } else {
+ memcpy(plane_map, &device->host_null_surface_state, ANV_SURFACE_STATE_SIZE);
+ }
+
+ if (sampler) {
+ memcpy(plane_map + ANV_SURFACE_STATE_SIZE,
+ anv_sampler_state_for_descriptor_set(sampler, set, p),
+ ANV_SAMPLER_STATE_SIZE);
+ } else {
+ memset(plane_map + ANV_SURFACE_STATE_SIZE, 0,
+ ANV_SAMPLER_STATE_SIZE);
+ }
}
- memcpy(desc_map, desc_data,
- MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0]));
}
}
+static const void *
+anv_buffer_view_surface_data(struct anv_buffer_view *buffer_view,
+ VkDescriptorType desc_type)
+{
+ if (desc_type == VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER)
+ return &buffer_view->general.state_data;
+
+ if (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER)
+ return &buffer_view->storage.state_data;
+
+ unreachable("Invalid descriptor type");
+}
+
void
anv_descriptor_set_write_buffer_view(struct anv_device *device,
struct anv_descriptor_set *set,
@@ -1411,50 +2332,79 @@ anv_descriptor_set_write_buffer_view(struct anv_device *device,
struct anv_descriptor *desc =
&set->descriptors[bind_layout->descriptor_index + element];
- assert(type == bind_layout->type);
-
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
- element * anv_descriptor_size(bind_layout);
-
- if (buffer_view == NULL) {
- *desc = (struct anv_descriptor) { .type = type, };
- memset(desc_map, 0, anv_descriptor_size(bind_layout));
- return;
- }
+ assert(type == bind_layout->type ||
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT);
*desc = (struct anv_descriptor) {
.type = type,
.buffer_view = buffer_view,
};
- if (bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
+ enum anv_descriptor_data data =
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_type(device->physical, set->layout->type,
+ set->layout->flags, type) :
+ bind_layout->data;
+
+ void *desc_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride;
+
+ if (buffer_view == NULL) {
+ if (data & ANV_DESCRIPTOR_SURFACE)
+ memcpy(desc_map, &device->host_null_surface_state, ANV_SURFACE_STATE_SIZE);
+ else
+ memset(desc_map, 0, bind_layout->descriptor_surface_stride);
+ return;
+ }
+
+ if (data & ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE) {
struct anv_sampled_image_descriptor desc_data = {
- .image = anv_surface_state_to_handle(buffer_view->surface_state),
+ .image = anv_surface_state_to_handle(
+ device->physical, buffer_view->general.state),
};
memcpy(desc_map, &desc_data, sizeof(desc_data));
}
- if (bind_layout->data & ANV_DESCRIPTOR_STORAGE_IMAGE) {
- assert(!(bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM));
+ if (data & ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE) {
struct anv_storage_image_descriptor desc_data = {
- .read_write = anv_surface_state_to_handle(
- buffer_view->storage_surface_state),
- .write_only = anv_surface_state_to_handle(
- buffer_view->writeonly_storage_surface_state),
+ .vanilla = anv_surface_state_to_handle(
+ device->physical, buffer_view->storage.state),
};
memcpy(desc_map, &desc_data, sizeof(desc_data));
}
- if (bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM) {
- anv_descriptor_set_write_image_param(desc_map,
- &buffer_view->storage_image_param);
+ if (data & ANV_DESCRIPTOR_SURFACE) {
+ memcpy(desc_map,
+ anv_buffer_view_surface_data(buffer_view, type),
+ ANV_SURFACE_STATE_SIZE);
}
}
void
+anv_descriptor_write_surface_state(struct anv_device *device,
+ struct anv_descriptor *desc,
+ struct anv_state surface_state)
+{
+ assert(surface_state.alloc_size);
+
+ struct anv_buffer_view *bview = desc->buffer_view;
+
+ bview->general.state = surface_state;
+
+ isl_surf_usage_flags_t usage =
+ anv_isl_usage_for_descriptor_type(desc->type);
+
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(device, desc->type);
+ anv_fill_buffer_surface_state(device, bview->general.state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ usage, bview->address, bview->vk.range, 1);
+}
+
+void
anv_descriptor_set_write_buffer(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_state_stream *alloc_stream,
VkDescriptorType type,
struct anv_buffer *buffer,
uint32_t binding,
@@ -1464,76 +2414,95 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
{
const struct anv_descriptor_set_binding_layout *bind_layout =
&set->layout->binding[binding];
- struct anv_descriptor *desc =
- &set->descriptors[bind_layout->descriptor_index + element];
+ const uint32_t descriptor_index = bind_layout->descriptor_index + element;
+ struct anv_descriptor *desc = &set->descriptors[descriptor_index];
+
+ assert(type == bind_layout->type ||
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT);
+
+ *desc = (struct anv_descriptor) {
+ .type = type,
+ .offset = offset,
+ .range = range,
+ .buffer = buffer,
+ };
- assert(type == bind_layout->type);
+ enum anv_descriptor_data data =
+ bind_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_type(device->physical, set->layout->type,
+ set->layout->flags, type) :
+ bind_layout->data;
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
- element * anv_descriptor_size(bind_layout);
+ void *desc_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride;
if (buffer == NULL) {
- *desc = (struct anv_descriptor) { .type = type, };
- memset(desc_map, 0, anv_descriptor_size(bind_layout));
+ if (data & ANV_DESCRIPTOR_SURFACE)
+ memcpy(desc_map, &device->host_null_surface_state, ANV_SURFACE_STATE_SIZE);
+ else
+ memset(desc_map, 0, bind_layout->descriptor_surface_stride);
return;
}
struct anv_address bind_addr = anv_address_add(buffer->address, offset);
- uint64_t bind_range = anv_buffer_get_range(buffer, offset, range);
+ desc->bind_range = vk_buffer_range(&buffer->vk, offset, range);
/* We report a bounds checking alignment of 32B for the sake of block
* messages which read an entire register worth at a time.
*/
if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
- bind_range = align_u64(bind_range, ANV_UBO_ALIGNMENT);
+ desc->bind_range = align64(desc->bind_range, ANV_UBO_ALIGNMENT);
- if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
- type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
- *desc = (struct anv_descriptor) {
- .type = type,
- .buffer = buffer,
- .offset = offset,
- .range = range,
+ if (data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE) {
+ struct anv_address_range_descriptor desc_data = {
+ .address = anv_address_physical(bind_addr),
+ .range = desc->bind_range,
};
- } else {
- assert(bind_layout->data & ANV_DESCRIPTOR_BUFFER_VIEW);
+ memcpy(desc_map, &desc_data, sizeof(desc_data));
+ }
+
+ if (data & ANV_DESCRIPTOR_SURFACE) {
+ isl_surf_usage_flags_t usage =
+ anv_isl_usage_for_descriptor_type(desc->type);
+
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(device, desc->type);
+
+ isl_buffer_fill_state(&device->isl_dev, desc_map,
+ .address = anv_address_physical(bind_addr),
+ .mocs = isl_mocs(&device->isl_dev, usage,
+ bind_addr.bo && anv_bo_is_external(bind_addr.bo)),
+ .size_B = desc->bind_range,
+ .format = format,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .stride_B = 1);
+ }
+
+ if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
+ return;
+
+ if (data & ANV_DESCRIPTOR_BUFFER_VIEW) {
struct anv_buffer_view *bview =
&set->buffer_views[bind_layout->buffer_view_index + element];
- bview->format = anv_isl_format_for_descriptor_type(device, type);
- bview->range = bind_range;
- bview->address = bind_addr;
-
- /* If we're writing descriptors through a push command, we need to
- * allocate the surface state from the command buffer. Otherwise it will
- * be allocated by the descriptor pool when calling
- * vkAllocateDescriptorSets. */
- if (alloc_stream)
- bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64);
+ desc->set_buffer_view = bview;
- isl_surf_usage_flags_t usage =
- (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
- type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) ?
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
- ISL_SURF_USAGE_STORAGE_BIT;
-
- anv_fill_buffer_surface_state(device, bview->surface_state,
- bview->format, usage,
- bind_addr, bind_range, 1);
-
- *desc = (struct anv_descriptor) {
- .type = type,
- .buffer_view = bview,
- };
- }
+ bview->vk.range = desc->bind_range;
+ bview->address = bind_addr;
- if (bind_layout->data & ANV_DESCRIPTOR_ADDRESS_RANGE) {
- struct anv_address_range_descriptor desc_data = {
- .address = anv_address_physical(bind_addr),
- .range = bind_range,
- };
- memcpy(desc_map, &desc_data, sizeof(desc_data));
+ if (set->is_push) {
+ set->generate_surface_states |= BITFIELD_BIT(descriptor_index);
+ /* Reset the surface state to make sure
+ * genX(cmd_buffer_emit_push_descriptor_surfaces) generates a new
+ * one.
+ */
+ bview->general.state = ANV_STATE_NULL;
+ } else {
+ anv_descriptor_write_surface_state(device, desc, bview->general.state);
+ }
}
}
@@ -1550,7 +2519,8 @@ anv_descriptor_set_write_inline_uniform_data(struct anv_device *device,
assert(bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM);
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset;
+ void *desc_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset;
memcpy(desc_map + offset, data, size);
}
@@ -1558,7 +2528,7 @@ anv_descriptor_set_write_inline_uniform_data(struct anv_device *device,
void
anv_descriptor_set_write_acceleration_structure(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_acceleration_structure *accel,
+ struct vk_acceleration_structure *accel,
uint32_t binding,
uint32_t element)
{
@@ -1567,35 +2537,36 @@ anv_descriptor_set_write_acceleration_structure(struct anv_device *device,
struct anv_descriptor *desc =
&set->descriptors[bind_layout->descriptor_index + element];
- assert(bind_layout->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR);
+ assert(bind_layout->data & ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE);
*desc = (struct anv_descriptor) {
.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR,
+ .accel_struct = accel,
};
struct anv_address_range_descriptor desc_data = { };
if (accel != NULL) {
- desc_data.address = anv_address_physical(accel->address);
+ desc_data.address = vk_acceleration_structure_get_va(accel);
desc_data.range = accel->size;
}
- assert(anv_descriptor_size(bind_layout) == sizeof(desc_data));
+ assert(sizeof(desc_data) <= bind_layout->descriptor_surface_stride);
- void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
- element * sizeof(desc_data);
+ void *desc_map = set->desc_surface_mem.map +
+ bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride;
memcpy(desc_map, &desc_data, sizeof(desc_data));
}
-void anv_UpdateDescriptorSets(
- VkDevice _device,
- uint32_t descriptorWriteCount,
- const VkWriteDescriptorSet* pDescriptorWrites,
- uint32_t descriptorCopyCount,
- const VkCopyDescriptorSet* pDescriptorCopies)
+void
+anv_descriptor_set_write(struct anv_device *device,
+ struct anv_descriptor_set *set_override,
+ uint32_t write_count,
+ const VkWriteDescriptorSet *writes)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- for (uint32_t i = 0; i < descriptorWriteCount; i++) {
- const VkWriteDescriptorSet *write = &pDescriptorWrites[i];
- ANV_FROM_HANDLE(anv_descriptor_set, set, write->dstSet);
+ for (uint32_t i = 0; i < write_count; i++) {
+ const VkWriteDescriptorSet *write = &writes[i];
+ struct anv_descriptor_set *set = unlikely(set_override) ?
+ set_override :
+ anv_descriptor_set_from_handle(write->dstSet);
switch (write->descriptorType) {
case VK_DESCRIPTOR_TYPE_SAMPLER:
@@ -1634,7 +2605,6 @@ void anv_UpdateDescriptorSets(
ANV_FROM_HANDLE(anv_buffer, buffer, write->pBufferInfo[j].buffer);
anv_descriptor_set_write_buffer(device, set,
- NULL,
write->descriptorType,
buffer,
write->dstBinding,
@@ -1644,10 +2614,10 @@ void anv_UpdateDescriptorSets(
}
break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: {
- const VkWriteDescriptorSetInlineUniformBlockEXT *inline_write =
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: {
+ const VkWriteDescriptorSetInlineUniformBlock *inline_write =
vk_find_struct_const(write->pNext,
- WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK_EXT);
+ WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK);
assert(inline_write->dataSize == write->descriptorCount);
anv_descriptor_set_write_inline_uniform_data(device, set,
write->dstBinding,
@@ -1663,7 +2633,7 @@ void anv_UpdateDescriptorSets(
assert(accel_write->accelerationStructureCount ==
write->descriptorCount);
for (uint32_t j = 0; j < write->descriptorCount; j++) {
- ANV_FROM_HANDLE(anv_acceleration_structure, accel,
+ ANV_FROM_HANDLE(vk_acceleration_structure, accel,
accel_write->pAccelerationStructures[j]);
anv_descriptor_set_write_acceleration_structure(device, set, accel,
write->dstBinding,
@@ -1676,6 +2646,19 @@ void anv_UpdateDescriptorSets(
break;
}
}
+}
+
+void anv_UpdateDescriptorSets(
+ VkDevice _device,
+ uint32_t descriptorWriteCount,
+ const VkWriteDescriptorSet* pDescriptorWrites,
+ uint32_t descriptorCopyCount,
+ const VkCopyDescriptorSet* pDescriptorCopies)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ anv_descriptor_set_write(device, NULL, descriptorWriteCount,
+ pDescriptorWrites);
for (uint32_t i = 0; i < descriptorCopyCount; i++) {
const VkCopyDescriptorSet *copy = &pDescriptorCopies[i];
@@ -1684,35 +2667,85 @@ void anv_UpdateDescriptorSets(
const struct anv_descriptor_set_binding_layout *src_layout =
&src->layout->binding[copy->srcBinding];
- struct anv_descriptor *src_desc =
- &src->descriptors[src_layout->descriptor_index];
- src_desc += copy->srcArrayElement;
-
const struct anv_descriptor_set_binding_layout *dst_layout =
&dst->layout->binding[copy->dstBinding];
- struct anv_descriptor *dst_desc =
- &dst->descriptors[dst_layout->descriptor_index];
- dst_desc += copy->dstArrayElement;
-
- if (src_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) {
- assert(src_layout->data == ANV_DESCRIPTOR_INLINE_UNIFORM);
- memcpy(dst->desc_mem.map + dst_layout->descriptor_offset +
- copy->dstArrayElement,
- src->desc_mem.map + src_layout->descriptor_offset +
- copy->srcArrayElement,
- copy->descriptorCount);
- } else {
- for (uint32_t j = 0; j < copy->descriptorCount; j++)
- dst_desc[j] = src_desc[j];
-
- unsigned desc_size = anv_descriptor_size(src_layout);
- if (desc_size > 0) {
- assert(desc_size == anv_descriptor_size(dst_layout));
- memcpy(dst->desc_mem.map + dst_layout->descriptor_offset +
- copy->dstArrayElement * desc_size,
- src->desc_mem.map + src_layout->descriptor_offset +
- copy->srcArrayElement * desc_size,
- copy->descriptorCount * desc_size);
+
+ if (src_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ anv_descriptor_set_write_inline_uniform_data(device, dst,
+ copy->dstBinding,
+ src->desc_surface_mem.map +
+ src_layout->descriptor_surface_offset + copy->srcArrayElement,
+ copy->dstArrayElement,
+ copy->descriptorCount);
+ continue;
+ }
+
+ uint32_t copy_surface_element_size =
+ MIN2(src_layout->descriptor_surface_stride,
+ dst_layout->descriptor_surface_stride);
+ uint32_t copy_sampler_element_size =
+ MIN2(src_layout->descriptor_sampler_stride,
+ dst_layout->descriptor_sampler_stride);
+ for (uint32_t j = 0; j < copy->descriptorCount; j++) {
+ struct anv_descriptor *src_desc =
+ &src->descriptors[src_layout->descriptor_index +
+ copy->srcArrayElement + j];
+ struct anv_descriptor *dst_desc =
+ &dst->descriptors[dst_layout->descriptor_index +
+ copy->dstArrayElement + j];
+
+ /* Copy the memory containing one of the following structures read by
+ * the shaders:
+ * - anv_sampled_image_descriptor
+ * - anv_storage_image_descriptor
+ * - anv_address_range_descriptor
+ * - RENDER_SURFACE_STATE
+ * - SAMPLER_STATE
+ */
+ memcpy(dst->desc_surface_mem.map +
+ dst_layout->descriptor_surface_offset +
+ (copy->dstArrayElement + j) * dst_layout->descriptor_surface_stride,
+ src->desc_surface_mem.map +
+ src_layout->descriptor_surface_offset +
+ (copy->srcArrayElement + j) * src_layout->descriptor_surface_stride,
+ copy_surface_element_size);
+ memcpy(dst->desc_sampler_mem.map +
+ dst_layout->descriptor_sampler_offset +
+ (copy->dstArrayElement + j) * dst_layout->descriptor_sampler_stride,
+ src->desc_sampler_mem.map +
+ src_layout->descriptor_sampler_offset +
+ (copy->srcArrayElement + j) * src_layout->descriptor_sampler_stride,
+ copy_sampler_element_size);
+
+ /* Copy the CPU-side anv_descriptor data */
+ *dst_desc = *src_desc;
+
+ /* If the CPU-side descriptor may contain a buffer view, we need to
+ * copy that as well.
+ */
+ const enum anv_descriptor_data data =
+ src_layout->type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT ?
+ anv_descriptor_data_for_type(device->physical,
+ src->layout->type,
+ src->layout->flags,
+ src_desc->type) :
+ src_layout->data;
+ if (data & ANV_DESCRIPTOR_BUFFER_VIEW) {
+ struct anv_buffer_view *src_bview =
+ &src->buffer_views[src_layout->buffer_view_index +
+ copy->srcArrayElement + j];
+ struct anv_buffer_view *dst_bview =
+ &dst->buffer_views[dst_layout->buffer_view_index +
+ copy->dstArrayElement + j];
+
+ dst_desc->set_buffer_view = dst_bview;
+
+ dst_bview->vk.range = src_bview->vk.range;
+ dst_bview->address = src_bview->address;
+
+ memcpy(dst_bview->general.state.map,
+ src_bview->general.state.map,
+ ANV_SURFACE_STATE_SIZE);
}
}
}
@@ -1725,12 +2758,11 @@ void anv_UpdateDescriptorSets(
void
anv_descriptor_set_write_template(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_state_stream *alloc_stream,
- const struct anv_descriptor_update_template *template,
+ const struct vk_descriptor_update_template *template,
const void *data)
{
for (uint32_t i = 0; i < template->entry_count; i++) {
- const struct anv_descriptor_template_entry *entry =
+ const struct vk_descriptor_template_entry *entry =
&template->entries[i];
switch (entry->type) {
@@ -1774,7 +2806,6 @@ anv_descriptor_set_write_template(struct anv_device *device,
ANV_FROM_HANDLE(anv_buffer, buffer, info->buffer);
anv_descriptor_set_write_buffer(device, set,
- alloc_stream,
entry->type,
buffer,
entry->binding,
@@ -1783,7 +2814,7 @@ anv_descriptor_set_write_template(struct anv_device *device,
}
break;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
anv_descriptor_set_write_inline_uniform_data(device, set,
entry->binding,
data + entry->offset,
@@ -1791,79 +2822,241 @@ anv_descriptor_set_write_template(struct anv_device *device,
entry->array_count);
break;
+ case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
+ for (uint32_t j = 0; j < entry->array_count; j++) {
+ VkAccelerationStructureKHR *accel_obj =
+ (VkAccelerationStructureKHR *)(data + entry->offset + j * entry->stride);
+ ANV_FROM_HANDLE(vk_acceleration_structure, accel, *accel_obj);
+
+ anv_descriptor_set_write_acceleration_structure(device, set,
+ accel,
+ entry->binding,
+ entry->array_element + j);
+ }
+ break;
+
default:
break;
}
}
}
-VkResult anv_CreateDescriptorUpdateTemplate(
+void anv_UpdateDescriptorSetWithTemplate(
VkDevice _device,
- const VkDescriptorUpdateTemplateCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkDescriptorUpdateTemplate* pDescriptorUpdateTemplate)
+ VkDescriptorSet descriptorSet,
+ VkDescriptorUpdateTemplate descriptorUpdateTemplate,
+ const void* pData)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_descriptor_update_template *template;
+ ANV_FROM_HANDLE(anv_descriptor_set, set, descriptorSet);
+ VK_FROM_HANDLE(vk_descriptor_update_template, template,
+ descriptorUpdateTemplate);
- size_t size = sizeof(*template) +
- pCreateInfo->descriptorUpdateEntryCount * sizeof(template->entries[0]);
- template = vk_object_alloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE);
- if (template == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ anv_descriptor_set_write_template(device, set, template, pData);
+}
- template->bind_point = pCreateInfo->pipelineBindPoint;
+void anv_GetDescriptorSetLayoutSizeEXT(
+ VkDevice device,
+ VkDescriptorSetLayout layout,
+ VkDeviceSize* pLayoutSizeInBytes)
+{
+ ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, layout);
- if (pCreateInfo->templateType == VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET)
- template->set = pCreateInfo->set;
+ *pLayoutSizeInBytes = set_layout->descriptor_buffer_surface_size;
+}
- template->entry_count = pCreateInfo->descriptorUpdateEntryCount;
- for (uint32_t i = 0; i < template->entry_count; i++) {
- const VkDescriptorUpdateTemplateEntry *pEntry =
- &pCreateInfo->pDescriptorUpdateEntries[i];
-
- template->entries[i] = (struct anv_descriptor_template_entry) {
- .type = pEntry->descriptorType,
- .binding = pEntry->dstBinding,
- .array_element = pEntry->dstArrayElement,
- .array_count = pEntry->descriptorCount,
- .offset = pEntry->offset,
- .stride = pEntry->stride,
- };
- }
+void anv_GetDescriptorSetLayoutBindingOffsetEXT(
+ VkDevice device,
+ VkDescriptorSetLayout layout,
+ uint32_t binding,
+ VkDeviceSize* pOffset)
+{
+ ANV_FROM_HANDLE(anv_descriptor_set_layout, set_layout, layout);
+ assert(binding < set_layout->binding_count);
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
- *pDescriptorUpdateTemplate =
- anv_descriptor_update_template_to_handle(template);
+ *pOffset = bind_layout->descriptor_surface_offset;
+}
- return VK_SUCCESS;
+static bool
+address_info_is_null(const VkDescriptorAddressInfoEXT *addr_info)
+{
+ return addr_info == NULL || addr_info->address == 0 || addr_info->range == 0;
}
-void anv_DestroyDescriptorUpdateTemplate(
+void anv_GetDescriptorEXT(
VkDevice _device,
- VkDescriptorUpdateTemplate descriptorUpdateTemplate,
- const VkAllocationCallbacks* pAllocator)
+ const VkDescriptorGetInfoEXT* pDescriptorInfo,
+ size_t dataSize,
+ void* pDescriptor)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_descriptor_update_template, template,
- descriptorUpdateTemplate);
+ struct anv_sampler *sampler;
+ struct anv_image_view *image_view;
- if (!template)
- return;
+ switch (pDescriptorInfo->type) {
+ case VK_DESCRIPTOR_TYPE_SAMPLER:
+ if (pDescriptorInfo->data.pSampler &&
+ (sampler = anv_sampler_from_handle(*pDescriptorInfo->data.pSampler))) {
+ memcpy(pDescriptor, sampler->db_state[0], ANV_SAMPLER_STATE_SIZE);
+ } else {
+ memset(pDescriptor, 0, ANV_SAMPLER_STATE_SIZE);
+ }
+ break;
- vk_object_free(&device->vk, pAllocator, template);
-}
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ for (uint32_t i = 0; i < dataSize / (2 * ANV_SURFACE_STATE_SIZE); i++) {
+ uint32_t desc_offset = i * 2 * ANV_SURFACE_STATE_SIZE;
+
+ if (pDescriptorInfo->data.pCombinedImageSampler &&
+ (image_view = anv_image_view_from_handle(
+ pDescriptorInfo->data.pCombinedImageSampler->imageView))) {
+ const VkImageLayout layout =
+ pDescriptorInfo->data.pCombinedImageSampler->imageLayout;
+ memcpy(pDescriptor + desc_offset,
+ anv_image_view_surface_data_for_plane_layout(image_view,
+ pDescriptorInfo->type,
+ i,
+ layout),
+ ANV_SURFACE_STATE_SIZE);
+ } else {
+ memcpy(pDescriptor + desc_offset,
+ device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
-void anv_UpdateDescriptorSetWithTemplate(
- VkDevice _device,
- VkDescriptorSet descriptorSet,
- VkDescriptorUpdateTemplate descriptorUpdateTemplate,
- const void* pData)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_descriptor_set, set, descriptorSet);
- ANV_FROM_HANDLE(anv_descriptor_update_template, template,
- descriptorUpdateTemplate);
+ if (pDescriptorInfo->data.pCombinedImageSampler &&
+ (sampler = anv_sampler_from_handle(
+ pDescriptorInfo->data.pCombinedImageSampler->sampler))) {
+ memcpy(pDescriptor + desc_offset + ANV_SURFACE_STATE_SIZE,
+ sampler->db_state[i],
+ ANV_SAMPLER_STATE_SIZE);
+ } else {
+ memset(pDescriptor + desc_offset + ANV_SURFACE_STATE_SIZE,
+ 0, ANV_SAMPLER_STATE_SIZE);
+ }
+ }
+ break;
+
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ if (pDescriptorInfo->data.pSampledImage &&
+ (image_view = anv_image_view_from_handle(
+ pDescriptorInfo->data.pSampledImage->imageView))) {
+ const VkImageLayout layout =
+ pDescriptorInfo->data.pSampledImage->imageLayout;
+
+ memcpy(pDescriptor,
+ anv_image_view_surface_data_for_plane_layout(image_view,
+ pDescriptorInfo->type,
+ 0,
+ layout),
+ ANV_SURFACE_STATE_SIZE);
+ } else {
+ memcpy(pDescriptor, device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ break;
+
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
+ const VkDescriptorAddressInfoEXT *addr_info =
+ pDescriptorInfo->data.pUniformTexelBuffer;
+
+ if (!address_info_is_null(addr_info)) {
+ struct anv_format_plane format =
+ anv_get_format_plane(device->info,
+ addr_info->format,
+ 0, VK_IMAGE_TILING_LINEAR);
+ const uint32_t format_bs =
+ isl_format_get_layout(format.isl_format)->bpb / 8;
+
+ anv_fill_buffer_surface_state(device, pDescriptor,
+ format.isl_format, format.swizzle,
+ ISL_SURF_USAGE_TEXTURE_BIT,
+ anv_address_from_u64(addr_info->address),
+ align_down_npot_u32(addr_info->range, format_bs),
+ format_bs);
+ } else {
+ memcpy(pDescriptor, device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ break;
+ }
+
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
+ const VkDescriptorAddressInfoEXT *addr_info =
+ pDescriptorInfo->data.pStorageTexelBuffer;
+
+ if (!address_info_is_null(addr_info)) {
+ struct anv_format_plane format =
+ anv_get_format_plane(device->info,
+ addr_info->format,
+ 0, VK_IMAGE_TILING_LINEAR);
+ const uint32_t format_bs =
+ isl_format_get_layout(format.isl_format)->bpb / 8;
+
+ anv_fill_buffer_surface_state(device, pDescriptor,
+ format.isl_format, format.swizzle,
+ ISL_SURF_USAGE_STORAGE_BIT,
+ anv_address_from_u64(addr_info->address),
+ align_down_npot_u32(addr_info->range, format_bs),
+ format_bs);
+ } else {
+ memcpy(pDescriptor, device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ break;
+ }
+
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
+ const VkDescriptorAddressInfoEXT *addr_info =
+ pDescriptorInfo->data.pStorageBuffer;
+
+ if (!address_info_is_null(addr_info)) {
+ VkDeviceSize range = addr_info->range;
- anv_descriptor_set_write_template(device, set, NULL, template, pData);
+ /* We report a bounds checking alignment of 32B for the sake of block
+ * messages which read an entire register worth at a time.
+ */
+ if (pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
+ range = align64(range, ANV_UBO_ALIGNMENT);
+
+ isl_surf_usage_flags_t usage =
+ pDescriptorInfo->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ?
+ ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
+ ISL_SURF_USAGE_STORAGE_BIT;
+
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(device, pDescriptorInfo->type);
+
+ isl_buffer_fill_state(&device->isl_dev, pDescriptor,
+ .address = addr_info->address,
+ .mocs = isl_mocs(&device->isl_dev, usage, false),
+ .size_B = range,
+ .format = format,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .stride_B = 1);
+ } else {
+ memcpy(pDescriptor, device->host_null_surface_state,
+ ANV_SURFACE_STATE_SIZE);
+ }
+ break;
+ }
+
+ case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
+ struct anv_address_range_descriptor desc_data = {
+ .address = pDescriptorInfo->data.accelerationStructure,
+ .range = 0,
+ };
+
+ memcpy(pDescriptor, &desc_data, sizeof(desc_data));
+ break;
+ }
+
+ default:
+ unreachable("Invalid descriptor type");
+ }
}
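The three entry points above form the VK_EXT_descriptor_buffer path: the application queries the layout size and per-binding offsets, then asks the driver to write raw descriptor payloads into a host-mapped buffer. A sketch of the expected call sequence (device, set_layout, ubo_device_address, descriptor_buffer_map and a descriptor size queried from VkPhysicalDeviceDescriptorBufferPropertiesEXT are assumed to exist):

    VkDeviceSize layout_size, binding0_offset;
    vkGetDescriptorSetLayoutSizeEXT(device, set_layout, &layout_size);
    vkGetDescriptorSetLayoutBindingOffsetEXT(device, set_layout, /*binding*/ 0,
                                             &binding0_offset);

    VkDescriptorAddressInfoEXT addr_info = {
       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_ADDRESS_INFO_EXT,
       .address = ubo_device_address,   /* VkDeviceAddress of the buffer */
       .range = 256,
    };
    VkDescriptorGetInfoEXT get_info = {
       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT,
       .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
       .data.pUniformBuffer = &addr_info,
    };
    /* Writes a surface-state sized blob at the binding's offset inside the
     * host-mapped descriptor buffer. */
    vkGetDescriptorEXT(device, &get_info, uniform_buffer_descriptor_size,
                       (char *)descriptor_buffer_map + binding0_offset);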
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 3d3ad15151e..507be254624 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -22,6 +22,7 @@
*/
#include <assert.h>
+#include <inttypes.h>
#include <stdbool.h>
#include <string.h>
#ifdef MAJOR_IN_MKDEV
@@ -40,35 +41,77 @@
#include "anv_private.h"
#include "anv_measure.h"
-#include "util/debug.h"
+#include "util/u_debug.h"
#include "util/build_id.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "util/os_file.h"
#include "util/os_misc.h"
#include "util/u_atomic.h"
+#if DETECT_OS_ANDROID
+#include "util/u_gralloc/u_gralloc.h"
+#endif
#include "util/u_string.h"
#include "util/driconf.h"
#include "git_sha1.h"
+#include "vk_common_entrypoints.h"
#include "vk_util.h"
#include "vk_deferred_operation.h"
+#include "vk_drm_syncobj.h"
#include "common/intel_aux_map.h"
-#include "common/intel_defines.h"
#include "common/intel_uuid.h"
#include "perf/intel_perf.h"
+#include "i915/anv_device.h"
+#include "xe/anv_device.h"
+#include "xe/anv_queue.h"
+
#include "genxml/gen7_pack.h"
+#include "genxml/genX_bits.h"
static const driOptionDescription anv_dri_options[] = {
DRI_CONF_SECTION_PERFORMANCE
+ DRI_CONF_ADAPTIVE_SYNC(true)
DRI_CONF_VK_X11_OVERRIDE_MIN_IMAGE_COUNT(0)
DRI_CONF_VK_X11_STRICT_IMAGE_COUNT(false)
- DRI_CONF_VK_XWAYLAND_WAIT_READY(true)
+ DRI_CONF_VK_KHR_PRESENT_WAIT(false)
+ DRI_CONF_VK_XWAYLAND_WAIT_READY(false)
+ DRI_CONF_ANV_ASSUME_FULL_SUBGROUPS(0)
+ DRI_CONF_ANV_DISABLE_FCV(false)
+ DRI_CONF_ANV_EXTERNAL_MEMORY_IMPLICIT_SYNC(true)
+ DRI_CONF_ANV_SAMPLE_MASK_OUT_OPENGL_BEHAVIOUR(false)
+ DRI_CONF_ANV_FORCE_FILTER_ADDR_ROUNDING(false)
+ DRI_CONF_ANV_FP64_WORKAROUND_ENABLED(false)
+ DRI_CONF_ANV_GENERATED_INDIRECT_THRESHOLD(4)
+ DRI_CONF_ANV_GENERATED_INDIRECT_RING_THRESHOLD(100)
+ DRI_CONF_NO_16BIT(false)
+ DRI_CONF_INTEL_ENABLE_WA_14018912822(false)
+ DRI_CONF_ANV_QUERY_CLEAR_WITH_BLORP_THRESHOLD(6)
+ DRI_CONF_ANV_QUERY_COPY_WITH_SHADER_THRESHOLD(6)
+ DRI_CONF_ANV_FORCE_INDIRECT_DESCRIPTORS(false)
+ DRI_CONF_SHADER_SPILLING_RATE(0)
+ DRI_CONF_OPT_B(intel_tbimr, true, "Enable TBIMR tiled rendering")
+ DRI_CONF_ANV_COMPRESSION_CONTROL_ENABLED(false)
DRI_CONF_SECTION_END
DRI_CONF_SECTION_DEBUG
DRI_CONF_ALWAYS_FLUSH_CACHE(false)
DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false)
+ DRI_CONF_VK_WSI_FORCE_SWAPCHAIN_TO_CURRENT_EXTENT(false)
+ DRI_CONF_VK_X11_IGNORE_SUBOPTIMAL(false)
+ DRI_CONF_LIMIT_TRIG_INPUT_RANGE(false)
+ DRI_CONF_ANV_MESH_CONV_PRIM_ATTRS_TO_VERT_ATTRS(-2)
+ DRI_CONF_FORCE_VK_VENDOR(0)
+ DRI_CONF_FAKE_SPARSE(false)
+#if DETECT_OS_ANDROID && ANDROID_API_LEVEL >= 34
+ DRI_CONF_VK_REQUIRE_ASTC(true)
+#else
+ DRI_CONF_VK_REQUIRE_ASTC(false)
+#endif
+ DRI_CONF_SECTION_END
+
+ DRI_CONF_SECTION_QUALITY
+ DRI_CONF_PP_LOWER_DEPTH_RANGE_RATE()
DRI_CONF_SECTION_END
};
@@ -77,9 +120,6 @@ static const driOptionDescription anv_dri_options[] = {
*/
#define MAX_DEBUG_MESSAGE_LENGTH 4096
-/* Render engine timestamp register */
-#define TIMESTAMP 0x2358
-
/* The "RAW" clocks on Linux are called "FAST" on FreeBSD */
#if !defined(CLOCK_MONOTONIC_RAW) && defined(CLOCK_MONOTONIC_FAST)
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC_FAST
@@ -90,19 +130,14 @@ compiler_debug_log(void *data, UNUSED unsigned *id, const char *fmt, ...)
{
char str[MAX_DEBUG_MESSAGE_LENGTH];
struct anv_device *device = (struct anv_device *)data;
- struct anv_instance *instance = device->physical->instance;
-
- if (list_is_empty(&instance->vk.debug_report.callbacks))
- return;
+ UNUSED struct anv_instance *instance = device->physical->instance;
va_list args;
va_start(args, fmt);
(void) vsnprintf(str, MAX_DEBUG_MESSAGE_LENGTH, fmt, args);
va_end(args);
- vk_debug_report(&instance->vk,
- VK_DEBUG_REPORT_DEBUG_BIT_EXT,
- NULL, 0, 0, "anv", str);
+ //vk_logd(VK_LOG_NO_OBJS(&instance->vk), "%s", str);
}
static void
@@ -111,7 +146,7 @@ compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...)
va_list args;
va_start(args, fmt);
- if (INTEL_DEBUG & DEBUG_PERF)
+ if (INTEL_DEBUG(DEBUG_PERF))
mesa_logd_v(fmt, args);
va_end(args);
@@ -124,10 +159,14 @@ compiler_perf_log(UNUSED void *data, UNUSED unsigned *id, const char *fmt, ...)
#define ANV_USE_WSI_PLATFORM
#endif
-#ifdef ANDROID
+#ifdef ANDROID_STRICT
+#if ANDROID_API_LEVEL >= 33
+#define ANV_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION)
+#else
#define ANV_API_VERSION VK_MAKE_VERSION(1, 1, VK_HEADER_VERSION)
+#endif
#else
-#define ANV_API_VERSION VK_MAKE_VERSION(1, 2, VK_HEADER_VERSION)
+#define ANV_API_VERSION VK_MAKE_VERSION(1, 3, VK_HEADER_VERSION)
#endif
VkResult anv_EnumerateInstanceVersion(
@@ -144,11 +183,14 @@ static const struct vk_instance_extension_table instance_extensions = {
.KHR_external_semaphore_capabilities = true,
.KHR_get_physical_device_properties2 = true,
.EXT_debug_report = true,
+ .EXT_debug_utils = true,
#ifdef ANV_USE_WSI_PLATFORM
.KHR_get_surface_capabilities2 = true,
.KHR_surface = true,
.KHR_surface_protected_capabilities = true,
+ .EXT_surface_maintenance1 = true,
+ .EXT_swapchain_colorspace = true,
#endif
#ifdef VK_USE_PLATFORM_WAYLAND_KHR
.KHR_wayland_surface = true,
@@ -169,18 +211,29 @@ static const struct vk_instance_extension_table instance_extensions = {
.EXT_display_surface_counter = true,
.EXT_acquire_drm_display = true,
#endif
+#ifndef VK_USE_PLATFORM_WIN32_KHR
+ .EXT_headless_surface = true,
+#endif
};
static void
get_device_extensions(const struct anv_physical_device *device,
struct vk_device_extension_table *ext)
{
+ const bool has_syncobj_wait =
+ (device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT) != 0;
+
+ const bool rt_enabled = ANV_SUPPORT_RT && device->info.has_ray_tracing;
+
*ext = (struct vk_device_extension_table) {
- .KHR_8bit_storage = device->info.ver >= 8,
- .KHR_16bit_storage = device->info.ver >= 8,
+ .KHR_8bit_storage = true,
+ .KHR_16bit_storage = !device->instance->no_16bit,
+ .KHR_acceleration_structure = rt_enabled,
.KHR_bind_memory2 = true,
- .KHR_buffer_device_address = device->has_a64_buffer_access,
+ .KHR_buffer_device_address = true,
+ .KHR_calibrated_timestamps = device->has_reg_timestamp,
.KHR_copy_commands2 = true,
+ .KHR_cooperative_matrix = anv_has_cooperative_matrix(device),
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
.KHR_deferred_host_operations = true,
@@ -189,43 +242,74 @@ get_device_extensions(const struct anv_physical_device *device,
.KHR_device_group = true,
.KHR_draw_indirect_count = true,
.KHR_driver_properties = true,
- .KHR_external_fence = device->has_syncobj_wait,
- .KHR_external_fence_fd = device->has_syncobj_wait,
+ .KHR_dynamic_rendering = true,
+ .KHR_external_fence = has_syncobj_wait,
+ .KHR_external_fence_fd = has_syncobj_wait,
.KHR_external_memory = true,
.KHR_external_memory_fd = true,
.KHR_external_semaphore = true,
.KHR_external_semaphore_fd = true,
+ .KHR_format_feature_flags2 = true,
.KHR_fragment_shading_rate = device->info.ver >= 11,
.KHR_get_memory_requirements2 = true,
+ .KHR_global_priority = device->max_context_priority >=
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
.KHR_image_format_list = true,
.KHR_imageless_framebuffer = true,
#ifdef ANV_USE_WSI_PLATFORM
.KHR_incremental_present = true,
#endif
+ .KHR_index_type_uint8 = true,
+ .KHR_line_rasterization = true,
+ .KHR_load_store_op_none = true,
.KHR_maintenance1 = true,
.KHR_maintenance2 = true,
.KHR_maintenance3 = true,
+ .KHR_maintenance4 = true,
+ .KHR_maintenance5 = true,
+ .KHR_maintenance6 = true,
+ .KHR_map_memory2 = true,
.KHR_multiview = true,
.KHR_performance_query =
- device->use_softpin && device->perf &&
+ device->perf &&
(device->perf->i915_perf_version >= 3 ||
- INTEL_DEBUG & DEBUG_NO_OACONFIG) &&
+ INTEL_DEBUG(DEBUG_NO_OACONFIG)) &&
device->use_call_secondary,
.KHR_pipeline_executable_properties = true,
+ .KHR_pipeline_library = true,
+ /* Hide these behind dri configs for now since we cannot implement them reliably on
+ * all surfaces yet. There is no surface capability query for present wait/id,
+ * but the feature is useful enough to expose behind an opt-in mechanism.
+ * If the instance only enables surface extensions that unconditionally support present wait,
+ * we can also expose the extension that way. */
+ .KHR_present_id =
+ driQueryOptionb(&device->instance->dri_options, "vk_khr_present_wait") ||
+ wsi_common_vk_instance_supports_present_wait(&device->instance->vk),
+ .KHR_present_wait =
+ driQueryOptionb(&device->instance->dri_options, "vk_khr_present_wait") ||
+ wsi_common_vk_instance_supports_present_wait(&device->instance->vk),
.KHR_push_descriptor = true,
+ .KHR_ray_query = rt_enabled,
+ .KHR_ray_tracing_maintenance1 = rt_enabled,
+ .KHR_ray_tracing_pipeline = rt_enabled,
+ .KHR_ray_tracing_position_fetch = rt_enabled,
.KHR_relaxed_block_layout = true,
.KHR_sampler_mirror_clamp_to_edge = true,
.KHR_sampler_ycbcr_conversion = true,
.KHR_separate_depth_stencil_layouts = true,
- .KHR_shader_atomic_int64 = device->info.ver >= 9 &&
- device->use_softpin,
+ .KHR_shader_atomic_int64 = true,
.KHR_shader_clock = true,
.KHR_shader_draw_parameters = true,
- .KHR_shader_float16_int8 = device->info.ver >= 8,
- .KHR_shader_float_controls = device->info.ver >= 8,
+ .KHR_shader_expect_assume = true,
+ .KHR_shader_float16_int8 = !device->instance->no_16bit,
+ .KHR_shader_float_controls = true,
+ .KHR_shader_float_controls2 = true,
.KHR_shader_integer_dot_product = true,
+ .KHR_shader_maximal_reconvergence = true,
.KHR_shader_non_semantic_info = true,
- .KHR_shader_subgroup_extended_types = device->info.ver >= 8,
+ .KHR_shader_quad_control = true,
+ .KHR_shader_subgroup_extended_types = true,
+ .KHR_shader_subgroup_rotate = true,
.KHR_shader_subgroup_uniform_control_flow = true,
.KHR_shader_terminate_invocation = true,
.KHR_spirv_1_4 = true,
@@ -234,64 +318,110 @@ get_device_extensions(const struct anv_physical_device *device,
.KHR_swapchain = true,
.KHR_swapchain_mutable_format = true,
#endif
+ .KHR_synchronization2 = true,
.KHR_timeline_semaphore = true,
.KHR_uniform_buffer_standard_layout = true,
.KHR_variable_pointers = true,
+ .KHR_vertex_attribute_divisor = true,
+ .KHR_video_queue = device->video_decode_enabled,
+ .KHR_video_decode_queue = device->video_decode_enabled,
+ .KHR_video_decode_h264 = VIDEO_CODEC_H264DEC && device->video_decode_enabled,
+ .KHR_video_decode_h265 = VIDEO_CODEC_H265DEC && device->video_decode_enabled,
.KHR_vulkan_memory_model = true,
.KHR_workgroup_memory_explicit_layout = true,
.KHR_zero_initialize_workgroup_memory = true,
.EXT_4444_formats = true,
- .EXT_buffer_device_address = device->has_a64_buffer_access,
+ .EXT_attachment_feedback_loop_layout = true,
+ .EXT_attachment_feedback_loop_dynamic_state = true,
+ .EXT_border_color_swizzle = true,
+ .EXT_buffer_device_address = true,
.EXT_calibrated_timestamps = device->has_reg_timestamp,
.EXT_color_write_enable = true,
- .EXT_conditional_rendering = device->info.verx10 >= 75,
- .EXT_conservative_rasterization = device->info.ver >= 9,
- .EXT_custom_border_color = device->info.ver >= 8,
+ .EXT_conditional_rendering = true,
+ .EXT_conservative_rasterization = true,
+ .EXT_custom_border_color = true,
+ .EXT_depth_bias_control = true,
+ .EXT_depth_clamp_zero_one = true,
+ .EXT_depth_clip_control = true,
+ .EXT_depth_range_unrestricted = device->info.ver >= 20,
.EXT_depth_clip_enable = true,
- .EXT_descriptor_indexing = device->has_a64_buffer_access &&
- device->has_bindless_images,
+ .EXT_descriptor_buffer = true,
+ .EXT_descriptor_indexing = true,
#ifdef VK_USE_PLATFORM_DISPLAY_KHR
.EXT_display_control = true,
#endif
+ .EXT_dynamic_rendering_unused_attachments = true,
.EXT_extended_dynamic_state = true,
.EXT_extended_dynamic_state2 = true,
+ .EXT_extended_dynamic_state3 = true,
.EXT_external_memory_dma_buf = true,
.EXT_external_memory_host = true,
- .EXT_fragment_shader_interlock = device->info.ver >= 9,
- .EXT_global_priority = device->has_context_priority,
+ .EXT_fragment_shader_interlock = true,
+ .EXT_global_priority = device->max_context_priority >=
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
+ .EXT_global_priority_query = device->max_context_priority >=
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
+ .EXT_graphics_pipeline_library = !debug_get_bool_option("ANV_NO_GPL", false),
.EXT_host_query_reset = true,
+ .EXT_image_2d_view_of_3d = true,
+ .EXT_image_compression_control = device->instance->compression_control_enabled,
.EXT_image_robustness = true,
.EXT_image_drm_format_modifier = true,
+ .EXT_image_sliced_view_of_3d = true,
+ .EXT_image_view_min_lod = true,
.EXT_index_type_uint8 = true,
.EXT_inline_uniform_block = true,
.EXT_line_rasterization = true,
- .EXT_memory_budget = device->sys.available,
+ .EXT_load_store_op_none = true,
+ .EXT_map_memory_placed = device->info.has_mmap_offset,
+ /* Enable the extension only if we have support for both local &
+ * system memory
+ */
+ .EXT_memory_budget = (!device->info.has_local_mem ||
+ device->vram_mappable.available > 0) &&
+ device->sys.available,
+ .EXT_mesh_shader = device->info.has_mesh_shading,
+ .EXT_mutable_descriptor_type = true,
+ .EXT_nested_command_buffer = true,
+ .EXT_non_seamless_cube_map = true,
.EXT_pci_bus_info = true,
.EXT_physical_device_drm = true,
.EXT_pipeline_creation_cache_control = true,
.EXT_pipeline_creation_feedback = true,
- .EXT_post_depth_coverage = device->info.ver >= 9,
+ .EXT_pipeline_library_group_handles = rt_enabled,
+ .EXT_pipeline_robustness = true,
+ .EXT_post_depth_coverage = true,
+ .EXT_primitives_generated_query = true,
+ .EXT_primitive_topology_list_restart = true,
.EXT_private_data = true,
.EXT_provoking_vertex = true,
.EXT_queue_family_foreign = true,
.EXT_robustness2 = true,
.EXT_sample_locations = true,
- .EXT_sampler_filter_minmax = device->info.ver >= 9,
+ .EXT_sampler_filter_minmax = true,
.EXT_scalar_block_layout = true,
.EXT_separate_stencil_usage = true,
.EXT_shader_atomic_float = true,
- .EXT_shader_atomic_float2 = device->info.ver >= 9,
+ .EXT_shader_atomic_float2 = true,
.EXT_shader_demote_to_helper_invocation = true,
- .EXT_shader_stencil_export = device->info.ver >= 9,
+ .EXT_shader_module_identifier = true,
+ .EXT_shader_stencil_export = true,
.EXT_shader_subgroup_ballot = true,
.EXT_shader_subgroup_vote = true,
.EXT_shader_viewport_index_layer = true,
.EXT_subgroup_size_control = true,
+#ifdef ANV_USE_WSI_PLATFORM
+ .EXT_swapchain_maintenance1 = true,
+#endif
.EXT_texel_buffer_alignment = true,
+ .EXT_tooling_info = true,
.EXT_transform_feedback = true,
.EXT_vertex_attribute_divisor = true,
+ .EXT_vertex_input_dynamic_state = true,
.EXT_ycbcr_image_arrays = true,
-#ifdef ANDROID
+ .AMD_buffer_marker = true,
+ .AMD_texture_gather_bias_lod = device->info.ver >= 20,
+#if DETECT_OS_ANDROID
.ANDROID_external_memory_android_hardware_buffer = true,
.ANDROID_native_buffer = true,
#endif
@@ -300,173 +430,1484 @@ get_device_extensions(const struct anv_physical_device *device,
.GOOGLE_user_type = true,
.INTEL_performance_query = device->perf &&
device->perf->i915_perf_version >= 3,
- .INTEL_shader_integer_functions2 = device->info.ver >= 8,
+ .INTEL_shader_integer_functions2 = true,
.EXT_multi_draw = true,
.NV_compute_shader_derivatives = true,
+ .VALVE_mutable_descriptor_type = true,
};
}
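/*
 * Illustrative app-side sketch, not driver code from this patch: the table
 * filled in by get_device_extensions() is what an application ultimately sees
 * through vkEnumerateDeviceExtensionProperties(). The physical-device handle
 * and the chosen extension name below are only examples; the ANV_NO_GPL
 * environment toggle above simply hides VK_EXT_graphics_pipeline_library from
 * this list.
 */
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <vulkan/vulkan.h>

static bool
has_device_extension(VkPhysicalDevice pdev, const char *name)
{
   uint32_t count = 0;
   vkEnumerateDeviceExtensionProperties(pdev, NULL, &count, NULL);

   VkExtensionProperties *props = calloc(count, sizeof(*props));
   if (!props)
      return false;
   vkEnumerateDeviceExtensionProperties(pdev, NULL, &count, props);

   bool found = false;
   for (uint32_t i = 0; i < count; i++) {
      if (strcmp(props[i].extensionName, name) == 0) {
         found = true;
         break;
      }
   }
   free(props);
   return found;
}

/* Example: only take the GPL path if the driver actually exposes it. */
/* bool use_gpl = has_device_extension(pdev,
 *                   VK_EXT_GRAPHICS_PIPELINE_LIBRARY_EXTENSION_NAME); */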
-static uint64_t
-anv_compute_sys_heap_size(struct anv_physical_device *device,
- uint64_t total_ram)
+static void
+get_features(const struct anv_physical_device *pdevice,
+ struct vk_features *features)
{
- /* We don't want to burn too much ram with the GPU. If the user has 4GiB
- * or less, we use at most half. If they have more than 4GiB, we use 3/4.
+ struct vk_app_info *app_info = &pdevice->instance->vk.app_info;
+
+ const bool rt_enabled = ANV_SUPPORT_RT && pdevice->info.has_ray_tracing;
+
+ const bool mesh_shader =
+ pdevice->vk.supported_extensions.EXT_mesh_shader;
+
+ const bool has_sparse_or_fake = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED;
+
+ *features = (struct vk_features) {
+ /* Vulkan 1.0 */
+ .robustBufferAccess = true,
+ .fullDrawIndexUint32 = true,
+ .imageCubeArray = true,
+ .independentBlend = true,
+ .geometryShader = true,
+ .tessellationShader = true,
+ .sampleRateShading = true,
+ .dualSrcBlend = true,
+ .logicOp = true,
+ .multiDrawIndirect = true,
+ .drawIndirectFirstInstance = true,
+ .depthClamp = true,
+ .depthBiasClamp = true,
+ .fillModeNonSolid = true,
+ .depthBounds = pdevice->info.ver >= 12,
+ .wideLines = true,
+ .largePoints = true,
+ .alphaToOne = true,
+ .multiViewport = true,
+ .samplerAnisotropy = true,
+ .textureCompressionETC2 = true,
+ .textureCompressionASTC_LDR = pdevice->has_astc_ldr ||
+ pdevice->emu_astc_ldr,
+ .textureCompressionBC = true,
+ .occlusionQueryPrecise = true,
+ .pipelineStatisticsQuery = true,
+ .vertexPipelineStoresAndAtomics = true,
+ .fragmentStoresAndAtomics = true,
+ .shaderTessellationAndGeometryPointSize = true,
+ .shaderImageGatherExtended = true,
+ .shaderStorageImageExtendedFormats = true,
+ .shaderStorageImageMultisample = false,
+      /* Gfx12.5 has all the required formats supported in HW for typed
+       * read/writes.
+       */
+ .shaderStorageImageReadWithoutFormat = pdevice->info.verx10 >= 125,
+ .shaderStorageImageWriteWithoutFormat = true,
+ .shaderUniformBufferArrayDynamicIndexing = true,
+ .shaderSampledImageArrayDynamicIndexing = true,
+ .shaderStorageBufferArrayDynamicIndexing = true,
+ .shaderStorageImageArrayDynamicIndexing = true,
+ .shaderClipDistance = true,
+ .shaderCullDistance = true,
+ .shaderFloat64 = pdevice->info.has_64bit_float ||
+ pdevice->instance->fp64_workaround_enabled,
+ .shaderInt64 = true,
+ .shaderInt16 = true,
+ .shaderResourceMinLod = true,
+ .shaderResourceResidency = has_sparse_or_fake,
+ .sparseBinding = has_sparse_or_fake,
+ .sparseResidencyAliased = has_sparse_or_fake,
+ .sparseResidencyBuffer = has_sparse_or_fake,
+ .sparseResidencyImage2D = has_sparse_or_fake,
+ .sparseResidencyImage3D = has_sparse_or_fake,
+ .sparseResidency2Samples = false,
+ .sparseResidency4Samples = false,
+ .sparseResidency8Samples = false,
+ .sparseResidency16Samples = false,
+ .variableMultisampleRate = true,
+ .inheritedQueries = true,
+
+ /* Vulkan 1.1 */
+ .storageBuffer16BitAccess = !pdevice->instance->no_16bit,
+ .uniformAndStorageBuffer16BitAccess = !pdevice->instance->no_16bit,
+ .storagePushConstant16 = true,
+ .storageInputOutput16 = false,
+ .multiview = true,
+ .multiviewGeometryShader = true,
+ .multiviewTessellationShader = true,
+ .variablePointersStorageBuffer = true,
+ .variablePointers = true,
+ .protectedMemory = pdevice->has_protected_contexts,
+ .samplerYcbcrConversion = true,
+ .shaderDrawParameters = true,
+
+ /* Vulkan 1.2 */
+ .samplerMirrorClampToEdge = true,
+ .drawIndirectCount = true,
+ .storageBuffer8BitAccess = true,
+ .uniformAndStorageBuffer8BitAccess = true,
+ .storagePushConstant8 = true,
+ .shaderBufferInt64Atomics = true,
+ .shaderSharedInt64Atomics = false,
+ .shaderFloat16 = !pdevice->instance->no_16bit,
+ .shaderInt8 = !pdevice->instance->no_16bit,
+
+ .descriptorIndexing = true,
+ .shaderInputAttachmentArrayDynamicIndexing = false,
+ .shaderUniformTexelBufferArrayDynamicIndexing = true,
+ .shaderStorageTexelBufferArrayDynamicIndexing = true,
+ .shaderUniformBufferArrayNonUniformIndexing = true,
+ .shaderSampledImageArrayNonUniformIndexing = true,
+ .shaderStorageBufferArrayNonUniformIndexing = true,
+ .shaderStorageImageArrayNonUniformIndexing = true,
+ .shaderInputAttachmentArrayNonUniformIndexing = false,
+ .shaderUniformTexelBufferArrayNonUniformIndexing = true,
+ .shaderStorageTexelBufferArrayNonUniformIndexing = true,
+ .descriptorBindingUniformBufferUpdateAfterBind = true,
+ .descriptorBindingSampledImageUpdateAfterBind = true,
+ .descriptorBindingStorageImageUpdateAfterBind = true,
+ .descriptorBindingStorageBufferUpdateAfterBind = true,
+ .descriptorBindingUniformTexelBufferUpdateAfterBind = true,
+ .descriptorBindingStorageTexelBufferUpdateAfterBind = true,
+ .descriptorBindingUpdateUnusedWhilePending = true,
+ .descriptorBindingPartiallyBound = true,
+ .descriptorBindingVariableDescriptorCount = true,
+ .runtimeDescriptorArray = true,
+
+ .samplerFilterMinmax = true,
+ .scalarBlockLayout = true,
+ .imagelessFramebuffer = true,
+ .uniformBufferStandardLayout = true,
+ .shaderSubgroupExtendedTypes = true,
+ .separateDepthStencilLayouts = true,
+ .hostQueryReset = true,
+ .timelineSemaphore = true,
+ .bufferDeviceAddress = true,
+ .bufferDeviceAddressCaptureReplay = true,
+ .bufferDeviceAddressMultiDevice = false,
+ .vulkanMemoryModel = true,
+ .vulkanMemoryModelDeviceScope = true,
+ .vulkanMemoryModelAvailabilityVisibilityChains = true,
+ .shaderOutputViewportIndex = true,
+ .shaderOutputLayer = true,
+ .subgroupBroadcastDynamicId = true,
+
+ /* Vulkan 1.3 */
+ .robustImageAccess = true,
+ .inlineUniformBlock = true,
+ .descriptorBindingInlineUniformBlockUpdateAfterBind = true,
+ .pipelineCreationCacheControl = true,
+ .privateData = true,
+ .shaderDemoteToHelperInvocation = true,
+ .shaderTerminateInvocation = true,
+ .subgroupSizeControl = true,
+ .computeFullSubgroups = true,
+ .synchronization2 = true,
+ .textureCompressionASTC_HDR = false,
+ .shaderZeroInitializeWorkgroupMemory = true,
+ .dynamicRendering = true,
+ .shaderIntegerDotProduct = true,
+ .maintenance4 = true,
+
+ /* VK_EXT_4444_formats */
+ .formatA4R4G4B4 = true,
+ .formatA4B4G4R4 = false,
+
+ /* VK_KHR_acceleration_structure */
+ .accelerationStructure = rt_enabled,
+ .accelerationStructureCaptureReplay = false, /* TODO */
+ .accelerationStructureIndirectBuild = false, /* TODO */
+ .accelerationStructureHostCommands = false,
+ .descriptorBindingAccelerationStructureUpdateAfterBind = rt_enabled,
+
+ /* VK_EXT_border_color_swizzle */
+ .borderColorSwizzle = true,
+ .borderColorSwizzleFromImage = true,
+
+ /* VK_EXT_color_write_enable */
+ .colorWriteEnable = true,
+
+ /* VK_EXT_image_2d_view_of_3d */
+ .image2DViewOf3D = true,
+ .sampler2DViewOf3D = true,
+
+ /* VK_EXT_image_sliced_view_of_3d */
+ .imageSlicedViewOf3D = true,
+
+ /* VK_NV_compute_shader_derivatives */
+ .computeDerivativeGroupQuads = true,
+ .computeDerivativeGroupLinear = true,
+
+ /* VK_EXT_conditional_rendering */
+ .conditionalRendering = true,
+ .inheritedConditionalRendering = true,
+
+ /* VK_EXT_custom_border_color */
+ .customBorderColors = true,
+ .customBorderColorWithoutFormat = true,
+
+ /* VK_EXT_depth_clamp_zero_one */
+ .depthClampZeroOne = true,
+
+ /* VK_EXT_depth_clip_enable */
+ .depthClipEnable = true,
+
+ /* VK_EXT_fragment_shader_interlock */
+ .fragmentShaderSampleInterlock = true,
+ .fragmentShaderPixelInterlock = true,
+ .fragmentShaderShadingRateInterlock = false,
+
+ /* VK_EXT_global_priority_query */
+ .globalPriorityQuery = true,
+
+ /* VK_EXT_graphics_pipeline_library */
+ .graphicsPipelineLibrary =
+ pdevice->vk.supported_extensions.EXT_graphics_pipeline_library,
+
+ /* VK_KHR_fragment_shading_rate */
+ .pipelineFragmentShadingRate = true,
+ .primitiveFragmentShadingRate =
+ pdevice->info.has_coarse_pixel_primitive_and_cb,
+ .attachmentFragmentShadingRate =
+ pdevice->info.has_coarse_pixel_primitive_and_cb,
+
+ /* VK_EXT_image_view_min_lod */
+ .minLod = true,
+
+ /* VK_EXT_index_type_uint8 */
+ .indexTypeUint8 = true,
+
+ /* VK_EXT_line_rasterization */
+ /* Rectangular lines must use the strict algorithm, which is not
+ * supported for wide lines prior to ICL. See rasterization_mode for
+ * details and how the HW states are programmed.
+ */
+ .rectangularLines = pdevice->info.ver >= 10,
+ .bresenhamLines = true,
+ /* Support for Smooth lines with MSAA was removed on gfx11. From the
+ * BSpec section "Multisample ModesState" table for "AA Line Support
+ * Requirements":
+ *
+ * GFX10:BUG:######## NUM_MULTISAMPLES == 1
+ *
+ * Fortunately, this isn't a case most people care about.
+ */
+ .smoothLines = pdevice->info.ver < 10,
+ .stippledRectangularLines = false,
+ .stippledBresenhamLines = true,
+ .stippledSmoothLines = false,
+
+ /* VK_NV_mesh_shader */
+ .taskShaderNV = false,
+ .meshShaderNV = false,
+
+ /* VK_EXT_mesh_shader */
+ .taskShader = mesh_shader,
+ .meshShader = mesh_shader,
+ .multiviewMeshShader = false,
+ .primitiveFragmentShadingRateMeshShader = mesh_shader,
+ .meshShaderQueries = false,
+
+ /* VK_EXT_mutable_descriptor_type */
+ .mutableDescriptorType = true,
+
+ /* VK_KHR_performance_query */
+ .performanceCounterQueryPools = true,
+ /* HW only supports a single configuration at a time. */
+ .performanceCounterMultipleQueryPools = false,
+
+ /* VK_KHR_pipeline_executable_properties */
+ .pipelineExecutableInfo = true,
+
+ /* VK_EXT_primitives_generated_query */
+ .primitivesGeneratedQuery = true,
+ .primitivesGeneratedQueryWithRasterizerDiscard = false,
+ .primitivesGeneratedQueryWithNonZeroStreams = false,
+
+ /* VK_EXT_pipeline_library_group_handles */
+ .pipelineLibraryGroupHandles = true,
+
+ /* VK_EXT_provoking_vertex */
+ .provokingVertexLast = true,
+ .transformFeedbackPreservesProvokingVertex = true,
+
+ /* VK_KHR_ray_query */
+ .rayQuery = rt_enabled,
+
+ /* VK_KHR_ray_tracing_maintenance1 */
+ .rayTracingMaintenance1 = rt_enabled,
+ .rayTracingPipelineTraceRaysIndirect2 = rt_enabled,
+
+ /* VK_KHR_ray_tracing_pipeline */
+ .rayTracingPipeline = rt_enabled,
+ .rayTracingPipelineShaderGroupHandleCaptureReplay = false,
+ .rayTracingPipelineShaderGroupHandleCaptureReplayMixed = false,
+ .rayTracingPipelineTraceRaysIndirect = rt_enabled,
+ .rayTraversalPrimitiveCulling = rt_enabled,
+
+ /* VK_EXT_robustness2 */
+ .robustBufferAccess2 = true,
+ .robustImageAccess2 = true,
+ .nullDescriptor = true,
+
+ /* VK_EXT_shader_atomic_float */
+ .shaderBufferFloat32Atomics = true,
+ .shaderBufferFloat32AtomicAdd = pdevice->info.has_lsc,
+ .shaderBufferFloat64Atomics =
+ pdevice->info.has_64bit_float && pdevice->info.has_lsc,
+ .shaderBufferFloat64AtomicAdd = false,
+ .shaderSharedFloat32Atomics = true,
+ .shaderSharedFloat32AtomicAdd = false,
+ .shaderSharedFloat64Atomics = false,
+ .shaderSharedFloat64AtomicAdd = false,
+ .shaderImageFloat32Atomics = true,
+ .shaderImageFloat32AtomicAdd = pdevice->info.ver >= 20,
+ .sparseImageFloat32Atomics = false,
+ .sparseImageFloat32AtomicAdd = false,
+
+ /* VK_EXT_shader_atomic_float2 */
+ .shaderBufferFloat16Atomics = pdevice->info.has_lsc,
+ .shaderBufferFloat16AtomicAdd = false,
+ .shaderBufferFloat16AtomicMinMax = pdevice->info.has_lsc,
+ .shaderBufferFloat32AtomicMinMax = true,
+ .shaderBufferFloat64AtomicMinMax =
+ pdevice->info.has_64bit_float && pdevice->info.has_lsc,
+ .shaderSharedFloat16Atomics = pdevice->info.has_lsc,
+ .shaderSharedFloat16AtomicAdd = false,
+ .shaderSharedFloat16AtomicMinMax = pdevice->info.has_lsc,
+ .shaderSharedFloat32AtomicMinMax = true,
+ .shaderSharedFloat64AtomicMinMax = false,
+ .shaderImageFloat32AtomicMinMax = false,
+ .sparseImageFloat32AtomicMinMax = false,
+
+ /* VK_KHR_shader_clock */
+ .shaderSubgroupClock = true,
+ .shaderDeviceClock = false,
+
+ /* VK_INTEL_shader_integer_functions2 */
+ .shaderIntegerFunctions2 = true,
+
+ /* VK_EXT_shader_module_identifier */
+ .shaderModuleIdentifier = true,
+
+ /* VK_KHR_shader_subgroup_uniform_control_flow */
+ .shaderSubgroupUniformControlFlow = true,
+
+ /* VK_EXT_texel_buffer_alignment */
+ .texelBufferAlignment = true,
+
+ /* VK_EXT_transform_feedback */
+ .transformFeedback = true,
+ .geometryStreams = true,
+
+ /* VK_KHR_vertex_attribute_divisor */
+ .vertexAttributeInstanceRateDivisor = true,
+ .vertexAttributeInstanceRateZeroDivisor = true,
+
+ /* VK_KHR_workgroup_memory_explicit_layout */
+ .workgroupMemoryExplicitLayout = true,
+ .workgroupMemoryExplicitLayoutScalarBlockLayout = true,
+ .workgroupMemoryExplicitLayout8BitAccess = true,
+ .workgroupMemoryExplicitLayout16BitAccess = true,
+
+ /* VK_EXT_ycbcr_image_arrays */
+ .ycbcrImageArrays = true,
+
+ /* VK_EXT_extended_dynamic_state */
+ .extendedDynamicState = true,
+
+ /* VK_EXT_extended_dynamic_state2 */
+ .extendedDynamicState2 = true,
+ .extendedDynamicState2LogicOp = true,
+ .extendedDynamicState2PatchControlPoints = true,
+
+ /* VK_EXT_extended_dynamic_state3 */
+ .extendedDynamicState3PolygonMode = true,
+ .extendedDynamicState3TessellationDomainOrigin = true,
+ .extendedDynamicState3RasterizationStream = true,
+ .extendedDynamicState3LineStippleEnable = true,
+ .extendedDynamicState3LineRasterizationMode = true,
+ .extendedDynamicState3LogicOpEnable = true,
+ .extendedDynamicState3AlphaToOneEnable = true,
+ .extendedDynamicState3DepthClipEnable = true,
+ .extendedDynamicState3DepthClampEnable = true,
+ .extendedDynamicState3DepthClipNegativeOneToOne = true,
+ .extendedDynamicState3ProvokingVertexMode = true,
+ .extendedDynamicState3ColorBlendEnable = true,
+ .extendedDynamicState3ColorWriteMask = true,
+ .extendedDynamicState3ColorBlendEquation = true,
+ .extendedDynamicState3SampleLocationsEnable = true,
+ .extendedDynamicState3SampleMask = true,
+ .extendedDynamicState3ConservativeRasterizationMode = true,
+ .extendedDynamicState3AlphaToCoverageEnable = true,
+ .extendedDynamicState3RasterizationSamples = true,
+
+ .extendedDynamicState3ExtraPrimitiveOverestimationSize = false,
+ .extendedDynamicState3ViewportWScalingEnable = false,
+ .extendedDynamicState3ViewportSwizzle = false,
+ .extendedDynamicState3ShadingRateImageEnable = false,
+ .extendedDynamicState3CoverageToColorEnable = false,
+ .extendedDynamicState3CoverageToColorLocation = false,
+ .extendedDynamicState3CoverageModulationMode = false,
+ .extendedDynamicState3CoverageModulationTableEnable = false,
+ .extendedDynamicState3CoverageModulationTable = false,
+ .extendedDynamicState3CoverageReductionMode = false,
+ .extendedDynamicState3RepresentativeFragmentTestEnable = false,
+ .extendedDynamicState3ColorBlendAdvanced = false,
+
+ /* VK_EXT_multi_draw */
+ .multiDraw = true,
+
+ /* VK_EXT_non_seamless_cube_map */
+ .nonSeamlessCubeMap = true,
+
+ /* VK_EXT_primitive_topology_list_restart */
+ .primitiveTopologyListRestart = true,
+ .primitiveTopologyPatchListRestart = true,
+
+ /* VK_EXT_depth_clip_control */
+ .depthClipControl = true,
+
+ /* VK_KHR_present_id */
+ .presentId = pdevice->vk.supported_extensions.KHR_present_id,
+
+ /* VK_KHR_present_wait */
+ .presentWait = pdevice->vk.supported_extensions.KHR_present_wait,
+
+ /* VK_EXT_vertex_input_dynamic_state */
+ .vertexInputDynamicState = true,
+
+ /* VK_KHR_ray_tracing_position_fetch */
+ .rayTracingPositionFetch = rt_enabled,
+
+ /* VK_EXT_dynamic_rendering_unused_attachments */
+ .dynamicRenderingUnusedAttachments = true,
+
+ /* VK_EXT_depth_bias_control */
+ .depthBiasControl = true,
+ .floatRepresentation = true,
+ .leastRepresentableValueForceUnormRepresentation = false,
+ .depthBiasExact = true,
+
+ /* VK_EXT_pipeline_robustness */
+ .pipelineRobustness = true,
+
+ /* VK_KHR_maintenance5 */
+ .maintenance5 = true,
+
+ /* VK_KHR_maintenance6 */
+ .maintenance6 = true,
+
+ /* VK_EXT_nested_command_buffer */
+ .nestedCommandBuffer = true,
+ .nestedCommandBufferRendering = true,
+ .nestedCommandBufferSimultaneousUse = false,
+
+ /* VK_KHR_cooperative_matrix */
+ .cooperativeMatrix = anv_has_cooperative_matrix(pdevice),
+
+ /* VK_KHR_shader_maximal_reconvergence */
+ .shaderMaximalReconvergence = true,
+
+ /* VK_KHR_shader_subgroup_rotate */
+ .shaderSubgroupRotate = true,
+ .shaderSubgroupRotateClustered = true,
+
+ /* VK_EXT_attachment_feedback_loop_layout */
+ .attachmentFeedbackLoopLayout = true,
+
+ /* VK_EXT_attachment_feedback_loop_dynamic_state */
+ .attachmentFeedbackLoopDynamicState = true,
+
+ /* VK_KHR_shader_expect_assume */
+ .shaderExpectAssume = true,
+
+ /* VK_EXT_descriptor_buffer */
+ .descriptorBuffer = true,
+ .descriptorBufferCaptureReplay = true,
+ .descriptorBufferImageLayoutIgnored = false,
+ .descriptorBufferPushDescriptors = true,
+
+ /* VK_EXT_map_memory_placed */
+ .memoryMapPlaced = true,
+ .memoryMapRangePlaced = false,
+ .memoryUnmapReserve = true,
+
+ /* VK_KHR_shader_quad_control */
+ .shaderQuadControl = true,
+
+#ifdef ANV_USE_WSI_PLATFORM
+ /* VK_EXT_swapchain_maintenance1 */
+ .swapchainMaintenance1 = true,
+#endif
+
+ /* VK_EXT_image_compression_control */
+ .imageCompressionControl = true,
+
+ /* VK_KHR_shader_float_controls2 */
+ .shaderFloatControls2 = true,
+ };
+
+ /* The new DOOM and Wolfenstein games require depthBounds without
+ * checking for it. They seem to run fine without it so just claim it's
+ * there and accept the consequences.
*/
- uint64_t available_ram;
- if (total_ram <= 4ull * 1024ull * 1024ull * 1024ull)
- available_ram = total_ram / 2;
- else
- available_ram = total_ram * 3 / 4;
+ if (app_info->engine_name && strcmp(app_info->engine_name, "idTech") == 0)
+ features->depthBounds = true;
+}
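/*
 * Illustrative app-side sketch, not driver code from this patch: how the
 * feature bits assembled in get_features() are read back by an application.
 * The VkPhysicalDevice handle is assumed to be valid; the struct and entry
 * point are standard Vulkan 1.1 / VK_EXT_mesh_shader API.
 */
#include <vulkan/vulkan.h>

static void
query_mesh_shader_support(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceMeshShaderFeaturesEXT mesh = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MESH_SHADER_FEATURES_EXT,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &mesh,
   };
   vkGetPhysicalDeviceFeatures2(pdev, &features);

   /* mesh.taskShader / mesh.meshShader mirror the EXT_mesh_shader support
    * computed above; features.features.depthBounds may additionally be
    * forced on for idTech titles by the workaround at the end of
    * get_features(). */
}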
+
+#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS 64
+
+#define MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS 64
+#define MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS 256
+
+#define MAX_CUSTOM_BORDER_COLORS 4096
+
+static VkDeviceSize
+anx_get_physical_device_max_heap_size(const struct anv_physical_device *pdevice)
+{
+ VkDeviceSize ret = 0;
+
+ for (uint32_t i = 0; i < pdevice->memory.heap_count; i++) {
+ if (pdevice->memory.heaps[i].size > ret)
+ ret = pdevice->memory.heaps[i].size;
+ }
+
+ return ret;
+}
+
+static void
+get_properties_1_1(const struct anv_physical_device *pdevice,
+ struct vk_properties *p)
+{
+ memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
+ memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
+ memset(p->deviceLUID, 0, VK_LUID_SIZE);
+ p->deviceNodeMask = 0;
+ p->deviceLUIDValid = false;
- /* We also want to leave some padding for things we allocate in the driver,
- * so don't go over 3/4 of the GTT either.
+ p->subgroupSize = BRW_SUBGROUP_SIZE;
+ VkShaderStageFlags scalar_stages = 0;
+ for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+ scalar_stages |= mesa_to_vk_shader_stage(stage);
+ }
+ if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
+ scalar_stages |= VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR;
+ }
+ if (pdevice->vk.supported_extensions.EXT_mesh_shader) {
+ scalar_stages |= VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT;
+ }
+ p->subgroupSupportedStages = scalar_stages;
+ p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
+ VK_SUBGROUP_FEATURE_VOTE_BIT |
+ VK_SUBGROUP_FEATURE_BALLOT_BIT |
+ VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
+ VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
+ VK_SUBGROUP_FEATURE_QUAD_BIT |
+ VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
+ VK_SUBGROUP_FEATURE_CLUSTERED_BIT |
+ VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR |
+ VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR;
+ p->subgroupQuadOperationsInAllStages = true;
+
+ p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY;
+ p->maxMultiviewViewCount = 16;
+ p->maxMultiviewInstanceIndex = UINT32_MAX / 16;
+   /* Our protected implementation is a memory encryption mechanism. It
+    * shouldn't page fault, but it hangs the HW, so in terms of user
+    * visibility it's similar to a fault.
*/
- available_ram = MIN2(available_ram, device->gtt_size * 3 / 4);
+ p->protectedNoFault = false;
+ /* This value doesn't matter for us today as our per-stage descriptors are
+ * the real limit.
+ */
+ p->maxPerSetDescriptors = 1024;
- if (available_ram > (2ull << 30) && !device->supports_48bit_addresses) {
- /* When running with an overridden PCI ID, we may get a GTT size from
- * the kernel that is greater than 2 GiB but the execbuf check for 48bit
- * address support can still fail. Just clamp the address space size to
- * 2 GiB if we don't have 48-bit support.
- */
- mesa_logw("%s:%d: The kernel reported a GTT size larger than 2 GiB but "
- "not support for 48-bit addresses",
- __FILE__, __LINE__);
- available_ram = 2ull << 30;
+ for (uint32_t i = 0; i < pdevice->memory.heap_count; i++) {
+ p->maxMemoryAllocationSize = MAX2(p->maxMemoryAllocationSize,
+ pdevice->memory.heaps[i].size);
}
+}
- return available_ram;
+static void
+get_properties_1_2(const struct anv_physical_device *pdevice,
+ struct vk_properties *p)
+{
+ p->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA;
+ memset(p->driverName, 0, sizeof(p->driverName));
+ snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE,
+ "Intel open-source Mesa driver");
+ memset(p->driverInfo, 0, sizeof(p->driverInfo));
+ snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE,
+ "Mesa " PACKAGE_VERSION MESA_GIT_SHA1);
+
+ p->conformanceVersion = (VkConformanceVersion) {
+ .major = 1,
+ .minor = 3,
+ .subminor = 6,
+ .patch = 0,
+ };
+
+ p->denormBehaviorIndependence =
+ VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL;
+ p->roundingModeIndependence =
+ VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE;
+
+   /* Broadwell does not support HF denorms and there are restrictions on
+    * other gens. According to Kabylake's PRM:
+ *
+ * "math - Extended Math Function
+ * [...]
+ * Restriction : Half-float denorms are always retained."
+ */
+ p->shaderDenormFlushToZeroFloat16 = false;
+ p->shaderDenormPreserveFloat16 = true;
+ p->shaderRoundingModeRTEFloat16 = true;
+ p->shaderRoundingModeRTZFloat16 = true;
+ p->shaderSignedZeroInfNanPreserveFloat16 = true;
+
+ p->shaderDenormFlushToZeroFloat32 = true;
+ p->shaderDenormPreserveFloat32 = true;
+ p->shaderRoundingModeRTEFloat32 = true;
+ p->shaderRoundingModeRTZFloat32 = true;
+ p->shaderSignedZeroInfNanPreserveFloat32 = true;
+
+ p->shaderDenormFlushToZeroFloat64 = true;
+ p->shaderDenormPreserveFloat64 = true;
+ p->shaderRoundingModeRTEFloat64 = true;
+ p->shaderRoundingModeRTZFloat64 = true;
+ p->shaderSignedZeroInfNanPreserveFloat64 = true;
+
+ /* It's a bit hard to exactly map our implementation to the limits
+ * described by Vulkan. The bindless surface handle in the extended
+ * message descriptors is 20 bits and it's an index into the table of
+ * RENDER_SURFACE_STATE structs that starts at bindless surface base
+    * address. This means that we can have at most 1M surface states
+ * allocated at any given time. Since most image views take two
+ * descriptors, this means we have a limit of about 500K image views.
+ *
+ * However, since we allocate surface states at vkCreateImageView time,
+ * this means our limit is actually something on the order of 500K image
+    * views allocated at any time. The actual limit described by Vulkan, on
+ * the other hand, is a limit of how many you can have in a descriptor set.
+ * Assuming anyone using 1M descriptors will be using the same image view
+ * twice a bunch of times (or a bunch of null descriptors), we can safely
+ * advertise a larger limit here.
+ */
+ const unsigned max_bindless_views =
+ anv_physical_device_bindless_heap_size(pdevice, false) / ANV_SURFACE_STATE_SIZE;
+ p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views;
+ p->shaderUniformBufferArrayNonUniformIndexingNative = false;
+ p->shaderSampledImageArrayNonUniformIndexingNative = false;
+ p->shaderStorageBufferArrayNonUniformIndexingNative = true;
+ p->shaderStorageImageArrayNonUniformIndexingNative = false;
+ p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
+ p->robustBufferAccessUpdateAfterBind = true;
+ p->quadDivergentImplicitLod = false;
+ p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
+ p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
+ p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
+ p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
+ p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
+ p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
+ p->maxPerStageUpdateAfterBindResources = UINT32_MAX;
+ p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
+ p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
+ p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
+ p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
+ p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
+ p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
+ p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
+ p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
+
+ /* We support all of the depth resolve modes */
+ p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT |
+ VK_RESOLVE_MODE_AVERAGE_BIT |
+ VK_RESOLVE_MODE_MIN_BIT |
+ VK_RESOLVE_MODE_MAX_BIT;
+ /* Average doesn't make sense for stencil so we don't support that */
+ p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT |
+ VK_RESOLVE_MODE_MIN_BIT |
+ VK_RESOLVE_MODE_MAX_BIT;
+ p->independentResolveNone = true;
+ p->independentResolve = true;
+
+ p->filterMinmaxSingleComponentFormats = true;
+ p->filterMinmaxImageComponentMapping = true;
+
+ p->maxTimelineSemaphoreValueDifference = UINT64_MAX;
+
+ p->framebufferIntegerColorSampleCounts =
+ isl_device_get_sample_counts(&pdevice->isl_dev);
}
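/*
 * Worked example of the bindless limit used above. The 20-bit surface handle
 * and the "roughly two descriptors per image view" figures come from the
 * comment in get_properties_1_2(); the 1 GiB heap and 64-byte surface state
 * size below are only illustrative assumptions, not values from this patch.
 */
#include <stdint.h>

static uint64_t
example_max_bindless_views(void)
{
   const uint64_t bindless_heap_size = 1ull << 30;   /* assumed 1 GiB heap */
   const uint64_t surface_state_size = 64;           /* assumed ANV_SURFACE_STATE_SIZE */

   /* Mirrors: max_bindless_views =
    *    anv_physical_device_bindless_heap_size() / ANV_SURFACE_STATE_SIZE */
   uint64_t views = bindless_heap_size / surface_state_size;  /* 16M with these numbers */

   /* For context: the 20-bit bindless handle caps live surface states at
    * 2^20 (~1M), i.e. on the order of 500K image views at two states per
    * view, which is why advertising a large update-after-bind limit is
    * considered safe. */
   return views;
}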
-static VkResult MUST_CHECK
-anv_gather_meminfo(struct anv_physical_device *device, int fd, bool update)
+static void
+get_properties_1_3(const struct anv_physical_device *pdevice,
+ struct vk_properties *p)
{
- char sys_mem_regions[sizeof(struct drm_i915_query_memory_regions) +
- sizeof(struct drm_i915_memory_region_info)];
-
- struct drm_i915_query_memory_regions *mem_regions =
- intel_i915_query_alloc(fd, DRM_I915_QUERY_MEMORY_REGIONS);
- if (mem_regions == NULL) {
- if (device->info.has_local_mem) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INCOMPATIBLE_DRIVER,
- "failed to memory regions: %m");
- }
+ if (pdevice->info.ver >= 20)
+ p->minSubgroupSize = 16;
+ else
+ p->minSubgroupSize = 8;
+ p->maxSubgroupSize = 32;
+ p->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads;
+ p->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT |
+ VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT;
+
+ p->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE;
+ p->maxPerStageDescriptorInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
+ p->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
+ p->maxDescriptorSetInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
+ p->maxDescriptorSetUpdateAfterBindInlineUniformBlocks =
+ MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
+ p->maxInlineUniformTotalSize = UINT16_MAX;
+
+ p->integerDotProduct8BitUnsignedAccelerated = false;
+ p->integerDotProduct8BitSignedAccelerated = false;
+ p->integerDotProduct8BitMixedSignednessAccelerated = false;
+ p->integerDotProduct4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProduct4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProduct4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProduct16BitUnsignedAccelerated = false;
+ p->integerDotProduct16BitSignedAccelerated = false;
+ p->integerDotProduct16BitMixedSignednessAccelerated = false;
+ p->integerDotProduct32BitUnsignedAccelerated = false;
+ p->integerDotProduct32BitSignedAccelerated = false;
+ p->integerDotProduct32BitMixedSignednessAccelerated = false;
+ p->integerDotProduct64BitUnsignedAccelerated = false;
+ p->integerDotProduct64BitSignedAccelerated = false;
+ p->integerDotProduct64BitMixedSignednessAccelerated = false;
+ p->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false;
+ p->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
+ p->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false;
+ p->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false;
+ p->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false;
+ p->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false;
+
+ /* From the SKL PRM Vol. 2d, docs for RENDER_SURFACE_STATE::Surface
+ * Base Address:
+ *
+ * "For SURFTYPE_BUFFER non-rendertarget surfaces, this field
+ * specifies the base address of the first element of the surface,
+ * computed in software by adding the surface base address to the
+ * byte offset of the element in the buffer. The base address must
+ * be aligned to element size."
+ *
+ * The typed dataport messages require that things be texel aligned.
+ * Otherwise, we may just load/store the wrong data or, in the worst
+ * case, there may be hangs.
+ */
+ p->storageTexelBufferOffsetAlignmentBytes = 16;
+ p->storageTexelBufferOffsetSingleTexelAlignment = true;
- uint64_t total_phys;
- if (!os_get_total_physical_memory(&total_phys)) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "failed to get total physical memory: %m");
- }
+ /* The sampler, however, is much more forgiving and it can handle
+ * arbitrary byte alignment for linear and buffer surfaces. It's
+ * hard to find a good PRM citation for this but years of empirical
+ * experience demonstrate that this is true.
+ */
+ p->uniformTexelBufferOffsetAlignmentBytes = 1;
+ p->uniformTexelBufferOffsetSingleTexelAlignment = true;
+
+ p->maxBufferSize = pdevice->isl_dev.max_buffer_size;
+}
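/*
 * Illustrative app-side sketch, not driver code from this patch: honoring the
 * 16-byte storage texel buffer offset alignment advertised above. The Vulkan
 * 1.3 properties struct is standard API; the device handle is assumed valid
 * and the rounding assumes the reported alignment is a power of two (it is
 * 16 here).
 */
#include <vulkan/vulkan.h>

static VkDeviceSize
align_storage_texel_offset(VkPhysicalDevice pdev, VkDeviceSize offset)
{
   VkPhysicalDeviceVulkan13Properties props13 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_PROPERTIES,
   };
   VkPhysicalDeviceProperties2 props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &props13,
   };
   vkGetPhysicalDeviceProperties2(pdev, &props);

   /* Round VkBufferViewCreateInfo::offset up to the required alignment,
    * since the dataport needs texel-aligned addresses. */
   const VkDeviceSize align = props13.storageTexelBufferOffsetAlignmentBytes;
   return (offset + align - 1) & ~(align - 1);
}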
+
+static void
+get_properties(const struct anv_physical_device *pdevice,
+ struct vk_properties *props)
+{
+
+ const struct intel_device_info *devinfo = &pdevice->info;
+
+ const uint32_t max_ssbos = UINT16_MAX;
+ const uint32_t max_textures = UINT16_MAX;
+ const uint32_t max_samplers = UINT16_MAX;
+ const uint32_t max_images = UINT16_MAX;
+ const VkDeviceSize max_heap_size = anx_get_physical_device_max_heap_size(pdevice);
+
+ /* Claim a high per-stage limit since we have bindless. */
+ const uint32_t max_per_stage = UINT32_MAX;
- uint64_t available;
- if (!os_get_available_system_memory(&available))
- available = 0; /* Silently disable VK_EXT_memory_budget */
+ const uint32_t max_workgroup_size =
+ MIN2(1024, 32 * devinfo->max_cs_workgroup_threads);
- /* The kernel query failed. Fake it using OS memory queries. This
- * should be roughly the same for integrated GPUs.
+ const bool has_sparse_or_fake = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED;
+ const bool sparse_uses_trtt = pdevice->sparse_type == ANV_SPARSE_TYPE_TRTT;
+
+ uint64_t sparse_addr_space_size =
+ !has_sparse_or_fake ? 0 :
+ sparse_uses_trtt ? pdevice->va.trtt.size :
+ pdevice->va.high_heap.size;
+
+ VkSampleCountFlags sample_counts =
+ isl_device_get_sample_counts(&pdevice->isl_dev);
+
+
+ *props = (struct vk_properties) {
+ .apiVersion = ANV_API_VERSION,
+ .driverVersion = vk_get_driver_version(),
+ .vendorID = pdevice->instance->force_vk_vendor != 0 ?
+ pdevice->instance->force_vk_vendor : 0x8086,
+ .deviceID = pdevice->info.pci_device_id,
+ .deviceType = pdevice->info.has_local_mem ?
+ VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU :
+ VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
+
+ /* Limits: */
+ .maxImageDimension1D = (1 << 14),
+ .maxImageDimension2D = (1 << 14),
+ .maxImageDimension3D = (1 << 11),
+ .maxImageDimensionCube = (1 << 14),
+ .maxImageArrayLayers = (1 << 11),
+ .maxTexelBufferElements = 128 * 1024 * 1024,
+ .maxUniformBufferRange = pdevice->compiler->indirect_ubos_use_sampler ? (1u << 27) : (1u << 30),
+ .maxStorageBufferRange = MIN3(pdevice->isl_dev.max_buffer_size, max_heap_size, UINT32_MAX),
+ .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
+ .maxMemoryAllocationCount = UINT32_MAX,
+ .maxSamplerAllocationCount = 64 * 1024,
+ .bufferImageGranularity = 1,
+ .sparseAddressSpaceSize = sparse_addr_space_size,
+ .maxBoundDescriptorSets = MAX_SETS,
+ .maxPerStageDescriptorSamplers = max_samplers,
+ .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
+ .maxPerStageDescriptorStorageBuffers = max_ssbos,
+ .maxPerStageDescriptorSampledImages = max_textures,
+ .maxPerStageDescriptorStorageImages = max_images,
+ .maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
+ .maxPerStageResources = max_per_stage,
+ .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
+ .maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */
+ .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
+ .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */
+ .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
+ .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
+ .maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */
+ .maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
+ .maxVertexInputAttributes = MAX_VES,
+ .maxVertexInputBindings = MAX_VBS,
+ /* Broadwell PRMs: Volume 2d: Command Reference: Structures:
+ *
+ * VERTEX_ELEMENT_STATE::Source Element Offset: [0,2047]
*/
- mem_regions = (void *)sys_mem_regions;
- mem_regions->num_regions = 1;
- mem_regions->regions[0] = (struct drm_i915_memory_region_info) {
- .region.memory_class = I915_MEMORY_CLASS_SYSTEM,
- .probed_size = total_phys,
- .unallocated_size = available,
- };
- }
+ .maxVertexInputAttributeOffset = 2047,
+ /* Skylake PRMs: Volume 2d: Command Reference: Structures:
+ *
+ * VERTEX_BUFFER_STATE::Buffer Pitch: [0,4095]
+ */
+ .maxVertexInputBindingStride = 4095,
+ .maxVertexOutputComponents = 128,
+ .maxTessellationGenerationLevel = 64,
+ .maxTessellationPatchSize = 32,
+ .maxTessellationControlPerVertexInputComponents = 128,
+ .maxTessellationControlPerVertexOutputComponents = 128,
+ .maxTessellationControlPerPatchOutputComponents = 128,
+ .maxTessellationControlTotalOutputComponents = 2048,
+ .maxTessellationEvaluationInputComponents = 128,
+ .maxTessellationEvaluationOutputComponents = 128,
+ .maxGeometryShaderInvocations = 32,
+ .maxGeometryInputComponents = 128,
+ .maxGeometryOutputComponents = 128,
+ .maxGeometryOutputVertices = 256,
+ .maxGeometryTotalOutputComponents = 1024,
+ .maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */
+ .maxFragmentOutputAttachments = 8,
+ .maxFragmentDualSrcAttachments = 1,
+ .maxFragmentCombinedOutputResources = MAX_RTS + max_ssbos + max_images,
+ .maxComputeSharedMemorySize = 64 * 1024,
+ .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
+ .maxComputeWorkGroupInvocations = max_workgroup_size,
+ .maxComputeWorkGroupSize = {
+ max_workgroup_size,
+ max_workgroup_size,
+ max_workgroup_size,
+ },
+ .subPixelPrecisionBits = 8,
+ .subTexelPrecisionBits = 8,
+ .mipmapPrecisionBits = 8,
+ .maxDrawIndexedIndexValue = UINT32_MAX,
+ .maxDrawIndirectCount = UINT32_MAX,
+ .maxSamplerLodBias = 16,
+ .maxSamplerAnisotropy = 16,
+ .maxViewports = MAX_VIEWPORTS,
+ .maxViewportDimensions = { (1 << 14), (1 << 14) },
+ .viewportBoundsRange = { INT16_MIN, INT16_MAX },
+ .viewportSubPixelBits = 13, /* We take a float? */
+ .minMemoryMapAlignment = 4096, /* A page */
+ /* The dataport requires texel alignment so we need to assume a worst
+ * case of R32G32B32A32 which is 16 bytes.
+ */
+ .minTexelBufferOffsetAlignment = 16,
+ .minUniformBufferOffsetAlignment = ANV_UBO_ALIGNMENT,
+ .minStorageBufferOffsetAlignment = ANV_SSBO_ALIGNMENT,
+ .minTexelOffset = -8,
+ .maxTexelOffset = 7,
+ .minTexelGatherOffset = -32,
+ .maxTexelGatherOffset = 31,
+ .minInterpolationOffset = -0.5,
+ .maxInterpolationOffset = 0.4375,
+ .subPixelInterpolationOffsetBits = 4,
+ .maxFramebufferWidth = (1 << 14),
+ .maxFramebufferHeight = (1 << 14),
+ .maxFramebufferLayers = (1 << 11),
+ .framebufferColorSampleCounts = sample_counts,
+ .framebufferDepthSampleCounts = sample_counts,
+ .framebufferStencilSampleCounts = sample_counts,
+ .framebufferNoAttachmentsSampleCounts = sample_counts,
+ .maxColorAttachments = MAX_RTS,
+ .sampledImageColorSampleCounts = sample_counts,
+ .sampledImageIntegerSampleCounts = sample_counts,
+ .sampledImageDepthSampleCounts = sample_counts,
+ .sampledImageStencilSampleCounts = sample_counts,
+ .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT,
+ .maxSampleMaskWords = 1,
+ .timestampComputeAndGraphics = true,
+ .timestampPeriod = 1000000000.0 / devinfo->timestamp_frequency,
+ .maxClipDistances = 8,
+ .maxCullDistances = 8,
+ .maxCombinedClipAndCullDistances = 8,
+ .discreteQueuePriorities = 2,
+ .pointSizeRange = { 0.125, 255.875 },
+ /* While SKL and up support much wider lines than we are setting here,
+ * in practice we run into conformance issues if we go past this limit.
+ * Since the Windows driver does the same, it's probably fair to assume
+ * that no one needs more than this.
+ */
+ .lineWidthRange = { 0.0, 8.0 },
+ .pointSizeGranularity = (1.0 / 8.0),
+ .lineWidthGranularity = (1.0 / 128.0),
+ .strictLines = false,
+ .standardSampleLocations = true,
+ .optimalBufferCopyOffsetAlignment = 128,
+ .optimalBufferCopyRowPitchAlignment = 128,
+ .nonCoherentAtomSize = 64,
- for(int i = 0; i < mem_regions->num_regions; i++) {
- struct drm_i915_memory_region_info *info = &mem_regions->regions[i];
+ /* Sparse: */
+ .sparseResidencyStandard2DBlockShape = has_sparse_or_fake,
+ .sparseResidencyStandard2DMultisampleBlockShape = false,
+ .sparseResidencyStandard3DBlockShape = has_sparse_or_fake,
+ .sparseResidencyAlignedMipSize = false,
+ .sparseResidencyNonResidentStrict = has_sparse_or_fake,
- struct anv_memregion *region;
- switch (info->region.memory_class) {
- case I915_MEMORY_CLASS_SYSTEM:
- region = &device->sys;
- break;
- case I915_MEMORY_CLASS_DEVICE:
- region = &device->vram;
- break;
- default:
- /* We don't know what kind of memory this is */
- continue;
- }
+ /* VK_KHR_cooperative_matrix */
+ .cooperativeMatrixSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT,
+ };
- uint64_t size = info->probed_size;
- if (info->region.memory_class == I915_MEMORY_CLASS_SYSTEM)
- size = anv_compute_sys_heap_size(device, size);
+ snprintf(props->deviceName, sizeof(props->deviceName),
+ "%s", pdevice->info.name);
+ memcpy(props->pipelineCacheUUID,
+ pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
- uint64_t available = MIN2(size, info->unallocated_size);
+ get_properties_1_1(pdevice, props);
+ get_properties_1_2(pdevice, props);
+ get_properties_1_3(pdevice, props);
+
+ /* VK_KHR_acceleration_structure */
+ {
+ props->maxGeometryCount = (1u << 24) - 1;
+ props->maxInstanceCount = (1u << 24) - 1;
+ props->maxPrimitiveCount = (1u << 29) - 1;
+ props->maxPerStageDescriptorAccelerationStructures = UINT16_MAX;
+ props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = UINT16_MAX;
+ props->maxDescriptorSetAccelerationStructures = UINT16_MAX;
+ props->maxDescriptorSetUpdateAfterBindAccelerationStructures = UINT16_MAX;
+ props->minAccelerationStructureScratchOffsetAlignment = 64;
+ }
- if (update) {
- assert(region->region.memory_class == info->region.memory_class);
- assert(region->region.memory_instance == info->region.memory_instance);
- assert(region->size == size);
+ /* VK_KHR_fragment_shading_rate */
+ {
+ props->primitiveFragmentShadingRateWithMultipleViewports =
+ pdevice->info.has_coarse_pixel_primitive_and_cb;
+ props->layeredShadingRateAttachments =
+ pdevice->info.has_coarse_pixel_primitive_and_cb;
+ props->fragmentShadingRateNonTrivialCombinerOps =
+ pdevice->info.has_coarse_pixel_primitive_and_cb;
+ props->maxFragmentSize = (VkExtent2D) { 4, 4 };
+ props->maxFragmentSizeAspectRatio =
+ pdevice->info.has_coarse_pixel_primitive_and_cb ?
+ 2 : 4;
+ props->maxFragmentShadingRateCoverageSamples = 4 * 4 *
+ (pdevice->info.has_coarse_pixel_primitive_and_cb ? 4 : 16);
+ props->maxFragmentShadingRateRasterizationSamples =
+ pdevice->info.has_coarse_pixel_primitive_and_cb ?
+ VK_SAMPLE_COUNT_4_BIT : VK_SAMPLE_COUNT_16_BIT;
+ props->fragmentShadingRateWithShaderDepthStencilWrites = false;
+ props->fragmentShadingRateWithSampleMask = true;
+ props->fragmentShadingRateWithShaderSampleMask = false;
+ props->fragmentShadingRateWithConservativeRasterization = true;
+ props->fragmentShadingRateWithFragmentShaderInterlock = true;
+ props->fragmentShadingRateWithCustomSampleLocations = true;
+ props->fragmentShadingRateStrictMultiplyCombiner = true;
+
+ if (pdevice->info.has_coarse_pixel_primitive_and_cb) {
+ props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 };
+ props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 8, 8 };
+ props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 1;
} else {
- region->region = info->region;
- region->size = size;
+ /* Those must be 0 if attachmentFragmentShadingRate is not supported. */
+ props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
+ props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
+ props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 0;
}
- region->available = available;
}
- if (mem_regions != (void *)sys_mem_regions)
- free(mem_regions);
+ /* VK_KHR_maintenance5 */
+ {
+ props->earlyFragmentMultisampleCoverageAfterSampleCounting = false;
+ props->earlyFragmentSampleMaskTestBeforeSampleCounting = false;
+ props->depthStencilSwizzleOneSupport = true;
+ props->polygonModePointSize = true;
+ props->nonStrictSinglePixelWideLinesUseParallelogram = false;
+ props->nonStrictWideLinesUseParallelogram = false;
+ }
- return VK_SUCCESS;
+ /* VK_KHR_maintenance6 */
+ {
+ props->blockTexelViewCompatibleMultipleLayers = true;
+ props->maxCombinedImageSamplerDescriptorCount = 3;
+ props->fragmentShadingRateClampCombinerInputs = true;
+ }
+
+ /* VK_KHR_performance_query */
+ {
+ props->allowCommandBufferQueryCopies = false;
+ }
+
+ /* VK_KHR_push_descriptor */
+ {
+ props->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
+ }
+
+ /* VK_KHR_ray_tracing_pipeline */
+ {
+ /* TODO */
+ props->shaderGroupHandleSize = 32;
+ props->maxRayRecursionDepth = 31;
+ /* MemRay::hitGroupSRStride is 16 bits */
+ props->maxShaderGroupStride = UINT16_MAX;
+ /* MemRay::hitGroupSRBasePtr requires 16B alignment */
+ props->shaderGroupBaseAlignment = 16;
+ props->shaderGroupHandleAlignment = 16;
+ props->shaderGroupHandleCaptureReplaySize = 32;
+ props->maxRayDispatchInvocationCount = 1U << 30; /* required min limit */
+ props->maxRayHitAttributeSize = BRW_RT_SIZEOF_HIT_ATTRIB_DATA;
+ }
+
+ /* VK_KHR_vertex_attribute_divisor */
+ {
+ props->maxVertexAttribDivisor = UINT32_MAX / 16;
+ props->supportsNonZeroFirstInstance = true;
+ }
+
+ /* VK_EXT_conservative_rasterization */
+ {
+ /* There's nothing in the public docs about this value as far as I can
+ * tell. However, this is the value the Windows driver reports and
+ * there's a comment on a rejected HW feature in the internal docs that
+ * says:
+ *
+ * "This is similar to conservative rasterization, except the
+ * primitive area is not extended by 1/512 and..."
+ *
+ * That's a bit of an obtuse reference but it's the best we've got for
+ * now.
+ */
+ props->primitiveOverestimationSize = 1.0f / 512.0f;
+ props->maxExtraPrimitiveOverestimationSize = 0.0f;
+ props->extraPrimitiveOverestimationSizeGranularity = 0.0f;
+ props->primitiveUnderestimation = false;
+ props->conservativePointAndLineRasterization = false;
+ props->degenerateTrianglesRasterized = true;
+ props->degenerateLinesRasterized = false;
+ props->fullyCoveredFragmentShaderInputVariable = false;
+ props->conservativeRasterizationPostDepthCoverage = true;
+ }
+
+ /* VK_EXT_custom_border_color */
+ {
+ props->maxCustomBorderColorSamplers = MAX_CUSTOM_BORDER_COLORS;
+ }
+
+ /* VK_EXT_descriptor_buffer */
+ {
+ props->combinedImageSamplerDescriptorSingleArray = true;
+ props->bufferlessPushDescriptors = true;
+ /* Written to the buffer before a timeline semaphore is signaled, but
+ * after vkQueueSubmit().
+ */
+ props->allowSamplerImageViewPostSubmitCreation = true;
+ props->descriptorBufferOffsetAlignment = ANV_SURFACE_STATE_SIZE;
+
+ if (pdevice->uses_ex_bso) {
+ props->maxDescriptorBufferBindings = MAX_SETS;
+ props->maxResourceDescriptorBufferBindings = MAX_SETS;
+ props->maxSamplerDescriptorBufferBindings = MAX_SETS;
+ props->maxEmbeddedImmutableSamplerBindings = MAX_SETS;
+ } else {
+ props->maxDescriptorBufferBindings = 3; /* resources, samplers, push (we don't care about push) */
+ props->maxResourceDescriptorBufferBindings = 1;
+ props->maxSamplerDescriptorBufferBindings = 1;
+ props->maxEmbeddedImmutableSamplerBindings = 1;
+ }
+ props->maxEmbeddedImmutableSamplers = MAX_EMBEDDED_SAMPLERS;
+
+ /* Storing a 64bit address */
+ props->bufferCaptureReplayDescriptorDataSize = 8;
+ props->imageCaptureReplayDescriptorDataSize = 8;
+ /* Offset inside the reserved border color pool */
+ props->samplerCaptureReplayDescriptorDataSize = 4;
+
+ /* Not affected by replay */
+ props->imageViewCaptureReplayDescriptorDataSize = 0;
+ /* The acceleration structure virtual address backing is coming from a
+ * buffer, so as long as that buffer is captured/replayed correctly we
+ * should always get the same address.
+ */
+ props->accelerationStructureCaptureReplayDescriptorDataSize = 0;
+
+ props->samplerDescriptorSize = ANV_SAMPLER_STATE_SIZE;
+ props->combinedImageSamplerDescriptorSize = align(ANV_SURFACE_STATE_SIZE + ANV_SAMPLER_STATE_SIZE,
+ ANV_SURFACE_STATE_SIZE);
+ props->sampledImageDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->storageImageDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->uniformTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->robustUniformTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->storageTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->robustStorageTexelBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->uniformBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->robustUniformBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->storageBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->robustStorageBufferDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->inputAttachmentDescriptorSize = ANV_SURFACE_STATE_SIZE;
+ props->accelerationStructureDescriptorSize = sizeof(struct anv_address_range_descriptor);
+ props->maxSamplerDescriptorBufferRange = pdevice->va.descriptor_buffer_pool.size;
+ props->maxResourceDescriptorBufferRange = anv_physical_device_bindless_heap_size(pdevice,
+ true);
+ props->resourceDescriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
+ props->descriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
+ props->samplerDescriptorBufferAddressSpaceSize = pdevice->va.descriptor_buffer_pool.size;
+ }
+
+ /* VK_EXT_extended_dynamic_state3 */
+ {
+ props->dynamicPrimitiveTopologyUnrestricted = true;
+ }
+
+ /* VK_EXT_external_memory_host */
+ {
+ props->minImportedHostPointerAlignment = 4096;
+ }
+
+ /* VK_EXT_graphics_pipeline_library */
+ {
+ props->graphicsPipelineLibraryFastLinking = true;
+ props->graphicsPipelineLibraryIndependentInterpolationDecoration = true;
+ }
+
+ /* VK_EXT_line_rasterization */
+ {
+ /* In the Skylake PRM Vol. 7, subsection titled "GIQ (Diamond) Sampling
+ * Rules - Legacy Mode", it says the following:
+ *
+ * "Note that the device divides a pixel into a 16x16 array of
+ * subpixels, referenced by their upper left corners."
+ *
+ * This is the only known reference in the PRMs to the subpixel
+ * precision of line rasterization and a "16x16 array of subpixels"
+ * implies 4 subpixel precision bits. Empirical testing has shown that 4
+ * subpixel precision bits applies to all line rasterization types.
+ */
+ props->lineSubPixelPrecisionBits = 4;
+ }
+
+ /* VK_EXT_map_memory_placed */
+ {
+ props->minPlacedMemoryMapAlignment = 4096;
+ }
+
+ /* VK_EXT_mesh_shader */
+ {
+ /* Bounded by the maximum representable size in
+ * 3DSTATE_MESH_SHADER_BODY::SharedLocalMemorySize. Same for Task.
+ */
+ const uint32_t max_slm_size = 64 * 1024;
+
+ /* Bounded by the maximum representable size in
+ * 3DSTATE_MESH_SHADER_BODY::LocalXMaximum. Same for Task.
+ */
+ const uint32_t max_workgroup_size = 1 << 10;
+
+ /* 3DMESH_3D limitation. */
+ const uint32_t max_threadgroup_count = 1 << 22;
+
+ /* 3DMESH_3D limitation. */
+ const uint32_t max_threadgroup_xyz = 65535;
+
+ const uint32_t max_urb_size = 64 * 1024;
+
+ props->maxTaskWorkGroupTotalCount = max_threadgroup_count;
+ props->maxTaskWorkGroupCount[0] = max_threadgroup_xyz;
+ props->maxTaskWorkGroupCount[1] = max_threadgroup_xyz;
+ props->maxTaskWorkGroupCount[2] = max_threadgroup_xyz;
+
+ props->maxTaskWorkGroupInvocations = max_workgroup_size;
+ props->maxTaskWorkGroupSize[0] = max_workgroup_size;
+ props->maxTaskWorkGroupSize[1] = max_workgroup_size;
+ props->maxTaskWorkGroupSize[2] = max_workgroup_size;
+
+ /* TUE header with padding */
+ const uint32_t task_payload_reserved = 32;
+
+ props->maxTaskPayloadSize = max_urb_size - task_payload_reserved;
+ props->maxTaskSharedMemorySize = max_slm_size;
+ props->maxTaskPayloadAndSharedMemorySize =
+ props->maxTaskPayloadSize +
+ props->maxTaskSharedMemorySize;
+
+ props->maxMeshWorkGroupTotalCount = max_threadgroup_count;
+ props->maxMeshWorkGroupCount[0] = max_threadgroup_xyz;
+ props->maxMeshWorkGroupCount[1] = max_threadgroup_xyz;
+ props->maxMeshWorkGroupCount[2] = max_threadgroup_xyz;
+
+ props->maxMeshWorkGroupInvocations = max_workgroup_size;
+ props->maxMeshWorkGroupSize[0] = max_workgroup_size;
+ props->maxMeshWorkGroupSize[1] = max_workgroup_size;
+ props->maxMeshWorkGroupSize[2] = max_workgroup_size;
+
+ props->maxMeshSharedMemorySize = max_slm_size;
+ props->maxMeshPayloadAndSharedMemorySize =
+ props->maxTaskPayloadSize +
+ props->maxMeshSharedMemorySize;
+
+      /* Unfortunately the spec's formula for the max output size doesn't match our hardware
+ * (because some per-primitive and per-vertex attributes have alignment restrictions),
+ * so we have to advertise the minimum value mandated by the spec to not overflow it.
+ */
+ props->maxMeshOutputPrimitives = 256;
+ props->maxMeshOutputVertices = 256;
+
+ /* NumPrim + Primitive Data List */
+ const uint32_t max_indices_memory =
+ ALIGN(sizeof(uint32_t) +
+ sizeof(uint32_t) * props->maxMeshOutputVertices, 32);
+
+ props->maxMeshOutputMemorySize = MIN2(max_urb_size - max_indices_memory, 32768);
+
+ props->maxMeshPayloadAndOutputMemorySize =
+ props->maxTaskPayloadSize +
+ props->maxMeshOutputMemorySize;
+
+ props->maxMeshOutputComponents = 128;
+
+ /* RTAIndex is 11-bits wide */
+ props->maxMeshOutputLayers = 1 << 11;
+
+ props->maxMeshMultiviewViewCount = 1;
+
+ /* Elements in Vertex Data Array must be aligned to 32 bytes (8 dwords). */
+ props->meshOutputPerVertexGranularity = 8;
+ /* Elements in Primitive Data Array must be aligned to 32 bytes (8 dwords). */
+ props->meshOutputPerPrimitiveGranularity = 8;
+
+ /* SIMD16 */
+ props->maxPreferredTaskWorkGroupInvocations = 16;
+ props->maxPreferredMeshWorkGroupInvocations = 16;
+
+ props->prefersLocalInvocationVertexOutput = false;
+ props->prefersLocalInvocationPrimitiveOutput = false;
+ props->prefersCompactVertexOutput = false;
+ props->prefersCompactPrimitiveOutput = false;
+
+ /* Spec minimum values */
+ assert(props->maxTaskWorkGroupTotalCount >= (1U << 22));
+ assert(props->maxTaskWorkGroupCount[0] >= 65535);
+ assert(props->maxTaskWorkGroupCount[1] >= 65535);
+ assert(props->maxTaskWorkGroupCount[2] >= 65535);
+
+ assert(props->maxTaskWorkGroupInvocations >= 128);
+ assert(props->maxTaskWorkGroupSize[0] >= 128);
+ assert(props->maxTaskWorkGroupSize[1] >= 128);
+ assert(props->maxTaskWorkGroupSize[2] >= 128);
+
+ assert(props->maxTaskPayloadSize >= 16384);
+ assert(props->maxTaskSharedMemorySize >= 32768);
+ assert(props->maxTaskPayloadAndSharedMemorySize >= 32768);
+
+
+ assert(props->maxMeshWorkGroupTotalCount >= (1U << 22));
+ assert(props->maxMeshWorkGroupCount[0] >= 65535);
+ assert(props->maxMeshWorkGroupCount[1] >= 65535);
+ assert(props->maxMeshWorkGroupCount[2] >= 65535);
+
+ assert(props->maxMeshWorkGroupInvocations >= 128);
+ assert(props->maxMeshWorkGroupSize[0] >= 128);
+ assert(props->maxMeshWorkGroupSize[1] >= 128);
+ assert(props->maxMeshWorkGroupSize[2] >= 128);
+
+ assert(props->maxMeshSharedMemorySize >= 28672);
+ assert(props->maxMeshPayloadAndSharedMemorySize >= 28672);
+ assert(props->maxMeshOutputMemorySize >= 32768);
+ assert(props->maxMeshPayloadAndOutputMemorySize >= 48128);
+
+ assert(props->maxMeshOutputComponents >= 128);
+
+ assert(props->maxMeshOutputVertices >= 256);
+ assert(props->maxMeshOutputPrimitives >= 256);
+ assert(props->maxMeshOutputLayers >= 8);
+ assert(props->maxMeshMultiviewViewCount >= 1);
+ }
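/*
 * Worked numbers for the mesh output size computation above, using only the
 * constants from this block (64 KiB URB, 256 output vertices, 32-byte
 * reserved TUE header):
 *
 *   max_indices_memory = ALIGN(4 + 4 * 256, 32) = ALIGN(1028, 32) = 1056
 *   maxMeshOutputMemorySize = MIN2(64 * 1024 - 1056, 32768)
 *                           = MIN2(64480, 32768) = 32768
 *   maxTaskPayloadSize = 64 * 1024 - 32 = 65504
 *   maxMeshPayloadAndOutputMemorySize = 65504 + 32768 = 98272
 *
 * which satisfies the spec minimums asserted at the end of the block
 * (32768 and 48128 respectively).
 */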
+
+ /* VK_EXT_multi_draw */
+ {
+ props->maxMultiDrawCount = 2048;
+ }
+
+ /* VK_EXT_nested_command_buffer */
+ {
+ props->maxCommandBufferNestingLevel = UINT32_MAX;
+ }
+
+ /* VK_EXT_pci_bus_info */
+ {
+ props->pciDomain = pdevice->info.pci_domain;
+ props->pciBus = pdevice->info.pci_bus;
+ props->pciDevice = pdevice->info.pci_dev;
+ props->pciFunction = pdevice->info.pci_func;
+ }
+
+ /* VK_EXT_physical_device_drm */
+ {
+ props->drmHasPrimary = pdevice->has_master;
+ props->drmPrimaryMajor = pdevice->master_major;
+ props->drmPrimaryMinor = pdevice->master_minor;
+ props->drmHasRender = pdevice->has_local;
+ props->drmRenderMajor = pdevice->local_major;
+ props->drmRenderMinor = pdevice->local_minor;
+ }
+
+ /* VK_EXT_pipeline_robustness */
+ {
+ props->defaultRobustnessStorageBuffers =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT;
+ props->defaultRobustnessUniformBuffers =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT;
+ props->defaultRobustnessVertexInputs =
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT;
+ props->defaultRobustnessImages =
+ VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT;
+ }
+
+ /* VK_EXT_provoking_vertex */
+ {
+ props->provokingVertexModePerPipeline = true;
+ props->transformFeedbackPreservesTriangleFanProvokingVertex = false;
+ }
+
+ /* VK_EXT_robustness2 */
+ {
+ props->robustStorageBufferAccessSizeAlignment =
+ ANV_SSBO_BOUNDS_CHECK_ALIGNMENT;
+ props->robustUniformBufferAccessSizeAlignment =
+ ANV_UBO_ALIGNMENT;
+ }
+
+ /* VK_EXT_sample_locations */
+ {
+ props->sampleLocationSampleCounts =
+ isl_device_get_sample_counts(&pdevice->isl_dev);
+
+ /* See also anv_GetPhysicalDeviceMultisamplePropertiesEXT */
+ props->maxSampleLocationGridSize.width = 1;
+ props->maxSampleLocationGridSize.height = 1;
+
+ props->sampleLocationCoordinateRange[0] = 0;
+ props->sampleLocationCoordinateRange[1] = 0.9375;
+ props->sampleLocationSubPixelBits = 4;
+
+ props->variableSampleLocations = true;
+ }
+
+ /* VK_EXT_shader_module_identifier */
+ {
+ STATIC_ASSERT(sizeof(vk_shaderModuleIdentifierAlgorithmUUID) ==
+ sizeof(props->shaderModuleIdentifierAlgorithmUUID));
+ memcpy(props->shaderModuleIdentifierAlgorithmUUID,
+ vk_shaderModuleIdentifierAlgorithmUUID,
+ sizeof(props->shaderModuleIdentifierAlgorithmUUID));
+ }
+
+ /* VK_EXT_transform_feedback */
+ {
+ props->maxTransformFeedbackStreams = MAX_XFB_STREAMS;
+ props->maxTransformFeedbackBuffers = MAX_XFB_BUFFERS;
+ props->maxTransformFeedbackBufferSize = (1ull << 32);
+ props->maxTransformFeedbackStreamDataSize = 128 * 4;
+ props->maxTransformFeedbackBufferDataSize = 128 * 4;
+ props->maxTransformFeedbackBufferDataStride = 2048;
+ props->transformFeedbackQueries = true;
+ props->transformFeedbackStreamsLinesTriangles = false;
+ props->transformFeedbackRasterizationStreamSelect = false;
+ props->transformFeedbackDraw = true;
+ }
}
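For context, the limits filled in above are read back by applications through the standard pNext chain; a minimal consumer-side sketch (not part of this patch, assuming only a valid VkPhysicalDevice and Vulkan 1.1 headers) for the VK_EXT_transform_feedback block could look like:

#include <vulkan/vulkan.h>
#include <stdio.h>

/* Illustrative only: reads the transform feedback limits reported by the
 * property code above, via the standard pNext chain. */
static void
print_xfb_limits(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceTransformFeedbackPropertiesEXT xfb_props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT,
   };
   VkPhysicalDeviceProperties2 props2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &xfb_props,
   };

   vkGetPhysicalDeviceProperties2(pdev, &props2);

   printf("XFB streams: %u, buffers: %u\n",
          xfb_props.maxTransformFeedbackStreams,
          xfb_props.maxTransformFeedbackBuffers);
}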
static VkResult MUST_CHECK
anv_init_meminfo(struct anv_physical_device *device, int fd)
{
- return anv_gather_meminfo(device, fd, false);
+ const struct intel_device_info *devinfo = &device->info;
+
+ device->sys.region = &devinfo->mem.sram.mem;
+ device->sys.size = devinfo->mem.sram.mappable.size;
+ device->sys.available = devinfo->mem.sram.mappable.free;
+
+ device->vram_mappable.region = &devinfo->mem.vram.mem;
+ device->vram_mappable.size = devinfo->mem.vram.mappable.size;
+ device->vram_mappable.available = devinfo->mem.vram.mappable.free;
+
+ device->vram_non_mappable.region = &devinfo->mem.vram.mem;
+ device->vram_non_mappable.size = devinfo->mem.vram.unmappable.size;
+ device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free;
+
+ return VK_SUCCESS;
}
static void
anv_update_meminfo(struct anv_physical_device *device, int fd)
{
- ASSERTED VkResult result = anv_gather_meminfo(device, fd, true);
- assert(result == VK_SUCCESS);
-}
+ if (!intel_device_info_update_memory_info(&device->info, fd))
+ return;
+ const struct intel_device_info *devinfo = &device->info;
+ device->sys.available = devinfo->mem.sram.mappable.free;
+ device->vram_mappable.available = devinfo->mem.vram.mappable.free;
+ device->vram_non_mappable.available = devinfo->mem.vram.unmappable.free;
+}
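The available values refreshed here are the driver's per-region view of free memory; on the application side this kind of information is typically surfaced through VK_EXT_memory_budget. A minimal illustrative query (not part of this patch, assuming the extension is supported on the device) could be:

#include <vulkan/vulkan.h>
#include <stdio.h>

/* Illustrative only: query per-heap budget/usage via VK_EXT_memory_budget. */
static void
print_memory_budget(VkPhysicalDevice pdev)
{
   VkPhysicalDeviceMemoryBudgetPropertiesEXT budget = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT,
   };
   VkPhysicalDeviceMemoryProperties2 mem_props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2,
      .pNext = &budget,
   };

   vkGetPhysicalDeviceMemoryProperties2(pdev, &mem_props);

   for (uint32_t i = 0; i < mem_props.memoryProperties.memoryHeapCount; i++) {
      printf("heap %u: budget %llu, usage %llu\n", i,
             (unsigned long long)budget.heapBudget[i],
             (unsigned long long)budget.heapUsage[i]);
   }
}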
static VkResult
anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
{
- if (anv_gem_get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE,
- &device->gtt_size) == -1) {
- /* If, for whatever reason, we can't actually get the GTT size from the
- * kernel (too old?) fall back to the aperture size.
- */
- anv_perf_warn(NULL, NULL,
- "Failed to get I915_CONTEXT_PARAM_GTT_SIZE: %m");
-
- if (intel_get_aperture_size(fd, &device->gtt_size) == -1) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "failed to get aperture size: %m");
- }
- }
-
- /* We only allow 48-bit addresses with softpin because knowing the actual
- * address is required for the vertex cache flush workaround.
- */
- device->supports_48bit_addresses = (device->info.ver >= 8) &&
- device->gtt_size > (4ULL << 30 /* GiB */);
-
VkResult result = anv_init_meminfo(device, fd);
if (result != VK_SUCCESS)
return result;
assert(device->sys.size != 0);
- if (device->vram.size > 0) {
- /* We can create 2 different heaps when we have local memory support,
- * first heap with local memory size and second with system memory size.
+ if (anv_physical_device_has_vram(device)) {
+      /* We can create 2 or 3 different heaps when we have local memory
+       * support: the first heap has the local memory size, the second the
+       * system memory size, and a third is added only if part of the vram
+       * is mappable to the host.
*/
device->memory.heap_count = 2;
device->memory.heaps[0] = (struct anv_memory_heap) {
- .size = device->vram.size,
+         /* If there is a vram_non_mappable, use that for the device-only
+ * heap. Otherwise use the vram_mappable.
+ */
+ .size = device->vram_non_mappable.size != 0 ?
+ device->vram_non_mappable.size : device->vram_mappable.size,
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.is_local_mem = true,
};
@@ -475,43 +1916,17 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
.flags = 0,
.is_local_mem = false,
};
-
- device->memory.type_count = 3;
- device->memory.types[0] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
- .heapIndex = 0,
- };
- device->memory.types[1] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
- VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
- .heapIndex = 1,
- };
- device->memory.types[2] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
- VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
- .heapIndex = 0,
- };
- } else if (device->info.has_llc) {
- device->memory.heap_count = 1;
- device->memory.heaps[0] = (struct anv_memory_heap) {
- .size = device->sys.size,
- .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
- .is_local_mem = false,
- };
-
- /* Big core GPUs share LLC with the CPU and thus one memory type can be
- * both cached and coherent at the same time.
+ /* Add an additional smaller vram mappable heap if we can't map all the
+ * vram to the host.
*/
- device->memory.type_count = 1;
- device->memory.types[0] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
- VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
- VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
- .heapIndex = 0,
- };
+ if (device->vram_non_mappable.size > 0) {
+ device->memory.heap_count++;
+ device->memory.heaps[2] = (struct anv_memory_heap) {
+ .size = device->vram_mappable.size,
+ .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
+ .is_local_mem = true,
+ };
+ }
} else {
device->memory.heap_count = 1;
device->memory.heaps[0] = (struct anv_memory_heap) {
@@ -519,33 +1934,60 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
.flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
.is_local_mem = false,
};
+ }
- /* The spec requires that we expose a host-visible, coherent memory
- * type, but Atom GPUs don't share LLC. Thus we offer two memory types
- * to give the application a choice between cached, but not coherent and
- * coherent but uncached (WC though).
- */
- device->memory.type_count = 2;
- device->memory.types[0] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
- VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
- .heapIndex = 0,
- };
- device->memory.types[1] = (struct anv_memory_type) {
- .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
- VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
- VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
- .heapIndex = 0,
- };
+ switch (device->info.kmd_type) {
+ case INTEL_KMD_TYPE_XE:
+ result = anv_xe_physical_device_init_memory_types(device);
+ break;
+ case INTEL_KMD_TYPE_I915:
+ default:
+ result = anv_i915_physical_device_init_memory_types(device);
+ break;
+ }
+
+ if (result != VK_SUCCESS)
+ return result;
+
+   /* Replicate all non-protected memory types for descriptor buffers so that
+    * we can identify those memory allocations and place them in the right
+    * memory heap.
+ */
+ device->memory.default_buffer_mem_types =
+ BITFIELD_RANGE(0, device->memory.type_count);
+ device->memory.protected_mem_types = 0;
+ device->memory.desc_buffer_mem_types = 0;
+
+ uint32_t base_types_count = device->memory.type_count;
+ for (int i = 0; i < base_types_count; i++) {
+ if (device->memory.types[i].propertyFlags &
+ VK_MEMORY_PROPERTY_PROTECTED_BIT) {
+ device->memory.protected_mem_types |= BITFIELD_BIT(i);
+ device->memory.default_buffer_mem_types &= (~BITFIELD_BIT(i));
+ continue;
+ }
+
+ assert(device->memory.type_count < ARRAY_SIZE(device->memory.types));
+
+ device->memory.desc_buffer_mem_types |=
+ BITFIELD_BIT(device->memory.type_count);
+
+ struct anv_memory_type *new_type =
+ &device->memory.types[device->memory.type_count++];
+ *new_type = device->memory.types[i];
+ new_type->descriptor_buffer = true;
}
- device->memory.need_clflush = false;
for (unsigned i = 0; i < device->memory.type_count; i++) {
VkMemoryPropertyFlags props = device->memory.types[i].propertyFlags;
if ((props & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) &&
!(props & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
- device->memory.need_clflush = true;
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ device->memory.need_flush = true;
+#else
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "Memory configuration requires flushing, but it's not implemented for this architecture");
+#endif
}
return VK_SUCCESS;
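The descriptor-buffer replication loop above maintains three bit masks over the memory type array; the following standalone sketch (with local stand-ins for Mesa's BITFIELD_* helpers so it builds outside the tree) walks the same bookkeeping over a toy three-type layout:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Local stand-ins for Mesa's BITFIELD_* helpers, for illustration only. */
#define BIT(b)        (1u << (b))
#define BIT_RANGE(n)  ((n) >= 32 ? ~0u : (BIT(n) - 1u))

struct toy_type { bool is_protected; bool descriptor_buffer; };

int
main(void)
{
   /* Three base types; index 1 is protected. */
   struct toy_type types[8] = { {false}, {true}, {false} };
   uint32_t count = 3;

   uint32_t default_mask = BIT_RANGE(count);   /* 0b111 */
   uint32_t protected_mask = 0, desc_buffer_mask = 0;
   uint32_t base_count = count;

   for (uint32_t i = 0; i < base_count; i++) {
      if (types[i].is_protected) {
         protected_mask |= BIT(i);
         default_mask &= ~BIT(i);
         continue;
      }
      /* Duplicate the type and flag the copy for descriptor buffers. */
      desc_buffer_mask |= BIT(count);
      types[count] = types[i];
      types[count++].descriptor_buffer = true;
   }

   /* Prints: default=0x5 protected=0x2 desc_buffer=0x18 count=5 */
   printf("default=0x%x protected=0x%x desc_buffer=0x%x count=%u\n",
          default_mask, protected_mask, desc_buffer_mask, count);
   return 0;
}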
@@ -557,16 +1999,14 @@ anv_physical_device_init_uuids(struct anv_physical_device *device)
const struct build_id_note *note =
build_id_find_nhdr_for_addr(anv_physical_device_init_uuids);
if (!note) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "Failed to find build-id");
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "Failed to find build-id");
}
unsigned build_id_len = build_id_length(note);
if (build_id_len < 20) {
- return vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "build-id too short. It needs to be a SHA");
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "build-id too short. It needs to be a SHA");
}
memcpy(device->driver_build_sha1, build_id_data(note), 20);
@@ -580,21 +2020,14 @@ anv_physical_device_init_uuids(struct anv_physical_device *device)
*/
_mesa_sha1_init(&sha1_ctx);
_mesa_sha1_update(&sha1_ctx, build_id_data(note), build_id_len);
- _mesa_sha1_update(&sha1_ctx, &device->info.chipset_id,
- sizeof(device->info.chipset_id));
+ brw_device_sha1_update(&sha1_ctx, &device->info);
_mesa_sha1_update(&sha1_ctx, &device->always_use_bindless,
sizeof(device->always_use_bindless));
- _mesa_sha1_update(&sha1_ctx, &device->has_a64_buffer_access,
- sizeof(device->has_a64_buffer_access));
- _mesa_sha1_update(&sha1_ctx, &device->has_bindless_images,
- sizeof(device->has_bindless_images));
- _mesa_sha1_update(&sha1_ctx, &device->has_bindless_samplers,
- sizeof(device->has_bindless_samplers));
_mesa_sha1_final(&sha1_ctx, sha1);
memcpy(device->pipeline_cache_uuid, sha1, VK_UUID_SIZE);
intel_uuid_compute_driver_id(device->driver_uuid, &device->info, VK_UUID_SIZE);
- intel_uuid_compute_device_id(device->device_uuid, &device->isl_dev, VK_UUID_SIZE);
+ intel_uuid_compute_device_id(device->device_uuid, &device->info, VK_UUID_SIZE);
return VK_SUCCESS;
}
@@ -605,7 +2038,7 @@ anv_physical_device_init_disk_cache(struct anv_physical_device *device)
#ifdef ENABLE_SHADER_CACHE
char renderer[10];
ASSERTED int len = snprintf(renderer, sizeof(renderer), "anv_%04x",
- device->info.chipset_id);
+ device->info.pci_device_id);
assert(len == sizeof(renderer) - 2);
char timestamp[41];
@@ -613,9 +2046,7 @@ anv_physical_device_init_disk_cache(struct anv_physical_device *device)
const uint64_t driver_flags =
brw_get_compiler_config_value(device->compiler);
- device->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
-#else
- device->disk_cache = NULL;
+ device->vk.disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
#endif
}
@@ -623,10 +2054,12 @@ static void
anv_physical_device_free_disk_cache(struct anv_physical_device *device)
{
#ifdef ENABLE_SHADER_CACHE
- if (device->disk_cache)
- disk_cache_destroy(device->disk_cache);
+ if (device->vk.disk_cache) {
+ disk_cache_destroy(device->vk.disk_cache);
+ device->vk.disk_cache = NULL;
+ }
#else
- assert(device->disk_cache == NULL);
+ assert(device->vk.disk_cache == NULL);
#endif
}
@@ -637,6 +2070,7 @@ anv_physical_device_free_disk_cache(struct anv_physical_device *device)
* * "gc" is for graphics queues with compute support
* * "g" is for graphics queues with no compute support
* * "c" is for compute queues with no graphics support
+ * * "v" is for video queues with no graphics support
*
* For example, ANV_QUEUE_OVERRIDE=gc=2,c=1 would override the number of
* advertised queues to be 2 queues with graphics+compute support, and 1 queue
@@ -651,11 +2085,12 @@ anv_physical_device_free_disk_cache(struct anv_physical_device *device)
* number of graphics+compute queues to be 0.
*/
static void
-anv_override_engine_counts(int *gc_count, int *g_count, int *c_count)
+anv_override_engine_counts(int *gc_count, int *g_count, int *c_count, int *v_count)
{
int gc_override = -1;
int g_override = -1;
int c_override = -1;
+ int v_override = -1;
char *env = getenv("ANV_QUEUE_OVERRIDE");
if (env == NULL)
@@ -671,6 +2106,8 @@ anv_override_engine_counts(int *gc_count, int *g_count, int *c_count)
g_override = strtol(next + 2, NULL, 0);
} else if (strncmp(next, "c=", 2) == 0) {
c_override = strtol(next + 2, NULL, 0);
+ } else if (strncmp(next, "v=", 2) == 0) {
+ v_override = strtol(next + 2, NULL, 0);
} else {
mesa_logw("Ignoring unsupported ANV_QUEUE_OVERRIDE token: %s", next);
}
@@ -686,58 +2123,119 @@ anv_override_engine_counts(int *gc_count, int *g_count, int *c_count)
"Vulkan specification");
if (c_override >= 0)
*c_count = c_override;
+ if (v_override >= 0)
+ *v_count = v_override;
}
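The new v= token follows the existing ANV_QUEUE_OVERRIDE grammar; purely as an illustration (not part of the change), a test harness could pin the advertised queue layout before instance creation like so:

#include <stdlib.h>

/* Illustrative only: advertise 2 graphics+compute queues, 1 compute-only
 * queue and no video queues, regardless of what the kernel exposes.
 * Must be set before the Vulkan instance/physical device is created. */
static void
force_anv_queue_layout(void)
{
   setenv("ANV_QUEUE_OVERRIDE", "gc=2,c=1,v=0", 1 /* overwrite */);
}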
static void
anv_physical_device_init_queue_families(struct anv_physical_device *pdevice)
{
uint32_t family_count = 0;
+ VkQueueFlags sparse_flags = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED ?
+ VK_QUEUE_SPARSE_BINDING_BIT : 0;
+ VkQueueFlags protected_flag = pdevice->has_protected_contexts ?
+ VK_QUEUE_PROTECTED_BIT : 0;
if (pdevice->engine_info) {
int gc_count =
- anv_gem_count_engines(pdevice->engine_info, I915_ENGINE_CLASS_RENDER);
+ intel_engines_count(pdevice->engine_info,
+ INTEL_ENGINE_CLASS_RENDER);
+ int v_count =
+ intel_engines_count(pdevice->engine_info, INTEL_ENGINE_CLASS_VIDEO);
int g_count = 0;
int c_count = 0;
+ const bool kernel_supports_non_render_engines = pdevice->has_vm_control;
+ const bool sparse_supports_non_render_engines =
+ pdevice->sparse_type != ANV_SPARSE_TYPE_TRTT;
+ const bool can_use_non_render_engines =
+ kernel_supports_non_render_engines &&
+ sparse_supports_non_render_engines;
- anv_override_engine_counts(&gc_count, &g_count, &c_count);
+ if (can_use_non_render_engines) {
+ c_count = intel_engines_supported_count(pdevice->local_fd,
+ &pdevice->info,
+ pdevice->engine_info,
+ INTEL_ENGINE_CLASS_COMPUTE);
+ }
+ enum intel_engine_class compute_class =
+ c_count < 1 ? INTEL_ENGINE_CLASS_RENDER : INTEL_ENGINE_CLASS_COMPUTE;
+
+ int blit_count = 0;
+ if (pdevice->info.verx10 >= 125 && can_use_non_render_engines) {
+ blit_count = intel_engines_supported_count(pdevice->local_fd,
+ &pdevice->info,
+ pdevice->engine_info,
+ INTEL_ENGINE_CLASS_COPY);
+ }
+
+ anv_override_engine_counts(&gc_count, &g_count, &c_count, &v_count);
if (gc_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
VK_QUEUE_COMPUTE_BIT |
- VK_QUEUE_TRANSFER_BIT,
+ VK_QUEUE_TRANSFER_BIT |
+ sparse_flags |
+ protected_flag,
.queueCount = gc_count,
- .engine_class = I915_ENGINE_CLASS_RENDER,
+ .engine_class = INTEL_ENGINE_CLASS_RENDER,
};
}
if (g_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
- VK_QUEUE_TRANSFER_BIT,
+ VK_QUEUE_TRANSFER_BIT |
+ sparse_flags |
+ protected_flag,
.queueCount = g_count,
- .engine_class = I915_ENGINE_CLASS_RENDER,
+ .engine_class = INTEL_ENGINE_CLASS_RENDER,
};
}
if (c_count > 0) {
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_COMPUTE_BIT |
- VK_QUEUE_TRANSFER_BIT,
+ VK_QUEUE_TRANSFER_BIT |
+ sparse_flags |
+ protected_flag,
.queueCount = c_count,
- .engine_class = I915_ENGINE_CLASS_RENDER,
+ .engine_class = compute_class,
+ };
+ }
+ if (v_count > 0 && pdevice->video_decode_enabled) {
+ /* HEVC support on Gfx9 is only available on VCS0. So limit the number of video queues
+ * to the first VCS engine instance.
+ *
+ * We should be able to query HEVC support from the kernel using the engine query uAPI,
+       * but this appears to be broken:
+       * https://gitlab.freedesktop.org/drm/intel/-/issues/8832
+       *
+       * When this bug is fixed, we should be able to check HEVC support to determine the
+ * correct number of queues.
+ */
+ /* TODO: enable protected content on video queue */
+ pdevice->queue.families[family_count++] = (struct anv_queue_family) {
+ .queueFlags = VK_QUEUE_VIDEO_DECODE_BIT_KHR,
+ .queueCount = pdevice->info.ver == 9 ? MIN2(1, v_count) : v_count,
+ .engine_class = INTEL_ENGINE_CLASS_VIDEO,
+ };
+ }
+ if (blit_count > 0) {
+ pdevice->queue.families[family_count++] = (struct anv_queue_family) {
+ .queueFlags = VK_QUEUE_TRANSFER_BIT |
+ protected_flag,
+ .queueCount = blit_count,
+ .engine_class = INTEL_ENGINE_CLASS_COPY,
};
}
- /* Increase count below when other families are added as a reminder to
- * increase the ANV_MAX_QUEUE_FAMILIES value.
- */
- STATIC_ASSERT(ANV_MAX_QUEUE_FAMILIES >= 3);
} else {
/* Default to a single render queue */
pdevice->queue.families[family_count++] = (struct anv_queue_family) {
.queueFlags = VK_QUEUE_GRAPHICS_BIT |
VK_QUEUE_COMPUTE_BIT |
- VK_QUEUE_TRANSFER_BIT,
+ VK_QUEUE_TRANSFER_BIT |
+ sparse_flags,
.queueCount = 1,
- .engine_class = I915_ENGINE_CLASS_RENDER,
+ .engine_class = INTEL_ENGINE_CLASS_RENDER,
};
family_count = 1;
}
@@ -746,45 +2244,79 @@ anv_physical_device_init_queue_families(struct anv_physical_device *pdevice)
}
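The video queue families advertised here (behind ANV_VIDEO_DECODE) are discovered by applications with the standard queue family query; a minimal sketch of the consumer side, assuming only a valid VkPhysicalDevice:

#include <vulkan/vulkan.h>

/* Illustrative only: return the first queue family exposing video decode,
 * or -1 if the driver does not advertise one. */
static int
find_video_decode_family(VkPhysicalDevice pdev)
{
   uint32_t count = 0;
   vkGetPhysicalDeviceQueueFamilyProperties2(pdev, &count, NULL);

   VkQueueFamilyProperties2 props[16];
   if (count > 16)
      count = 16;
   for (uint32_t i = 0; i < count; i++) {
      props[i] = (VkQueueFamilyProperties2) {
         .sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2,
      };
   }
   vkGetPhysicalDeviceQueueFamilyProperties2(pdev, &count, props);

   for (uint32_t i = 0; i < count; i++) {
      if (props[i].queueFamilyProperties.queueFlags &
          VK_QUEUE_VIDEO_DECODE_BIT_KHR)
         return (int)i;
   }
   return -1;
}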
static VkResult
-anv_physical_device_try_create(struct anv_instance *instance,
- drmDevicePtr drm_device,
- struct anv_physical_device **device_out)
+anv_physical_device_get_parameters(struct anv_physical_device *device)
+{
+ switch (device->info.kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_physical_device_get_parameters(device);
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_physical_device_get_parameters(device);
+ default:
+ unreachable("Missing");
+ return VK_ERROR_UNKNOWN;
+ }
+}
+
+static VkResult
+anv_physical_device_try_create(struct vk_instance *vk_instance,
+ struct _drmDevice *drm_device,
+ struct vk_physical_device **out)
{
+ struct anv_instance *instance =
+ container_of(vk_instance, struct anv_instance, vk);
+
+ if (!(drm_device->available_nodes & (1 << DRM_NODE_RENDER)) ||
+ drm_device->bustype != DRM_BUS_PCI ||
+ drm_device->deviceinfo.pci->vendor_id != 0x8086)
+ return VK_ERROR_INCOMPATIBLE_DRIVER;
+
const char *primary_path = drm_device->nodes[DRM_NODE_PRIMARY];
const char *path = drm_device->nodes[DRM_NODE_RENDER];
VkResult result;
int fd;
int master_fd = -1;
- brw_process_intel_debug_variable();
+ process_intel_debug_variable();
fd = open(path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
if (errno == ENOMEM) {
- return vk_errorfi(instance, NULL, VK_ERROR_OUT_OF_HOST_MEMORY,
- "Unable to open device %s: out of memory", path);
+ return vk_errorf(instance, VK_ERROR_OUT_OF_HOST_MEMORY,
+ "Unable to open device %s: out of memory", path);
}
- return vk_errorfi(instance, NULL, VK_ERROR_INCOMPATIBLE_DRIVER,
- "Unable to open device %s: %m", path);
+ return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "Unable to open device %s: %m", path);
}
struct intel_device_info devinfo;
- if (!intel_get_device_info_from_fd(fd, &devinfo)) {
- result = vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+ if (!intel_get_device_info_from_fd(fd, &devinfo, 9, -1)) {
+ result = VK_ERROR_INCOMPATIBLE_DRIVER;
goto fail_fd;
}
- if (devinfo.is_haswell) {
- mesa_logw("Haswell Vulkan support is incomplete");
- } else if (devinfo.ver == 7 && !devinfo.is_baytrail) {
- mesa_logw("Ivy Bridge Vulkan support is incomplete");
- } else if (devinfo.ver == 7 && devinfo.is_baytrail) {
- mesa_logw("Bay Trail Vulkan support is incomplete");
- } else if (devinfo.ver >= 8 && devinfo.ver <= 12) {
- /* Gfx8-12 fully supported */
- } else {
- result = vk_errorfi(instance, NULL, VK_ERROR_INCOMPATIBLE_DRIVER,
- "Vulkan not yet supported on %s", devinfo.name);
+ if (devinfo.ver == 20) {
+ mesa_logw("Vulkan not yet supported on %s", devinfo.name);
+ } else if (devinfo.ver > 12) {
+ result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "Vulkan not yet supported on %s", devinfo.name);
+ goto fail_fd;
+ } else if (devinfo.ver < 9) {
+ /* Silently fail here, hasvk should pick up this device. */
+ result = VK_ERROR_INCOMPATIBLE_DRIVER;
+ goto fail_fd;
+ }
+
+ /* Disable Wa_16013994831 on Gfx12.0 because we found other cases where we
+    * need to always disable preemption:
+ * - https://gitlab.freedesktop.org/mesa/mesa/-/issues/5963
+ * - https://gitlab.freedesktop.org/mesa/mesa/-/issues/5662
+ */
+ if (devinfo.verx10 == 120)
+ BITSET_CLEAR(devinfo.workarounds, INTEL_WA_16013994831);
+
+ if (!devinfo.has_context_isolation) {
+ result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "Vulkan requires context isolation for %s", devinfo.name);
goto fail_fd;
}
@@ -792,19 +2324,21 @@ anv_physical_device_try_create(struct anv_instance *instance,
vk_zalloc(&instance->vk.alloc, sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
if (device == NULL) {
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_fd;
}
struct vk_physical_device_dispatch_table dispatch_table;
vk_physical_device_dispatch_table_from_entrypoints(
&dispatch_table, &anv_physical_device_entrypoints, true);
+ vk_physical_device_dispatch_table_from_entrypoints(
+ &dispatch_table, &wsi_physical_device_entrypoints, false);
result = vk_physical_device_init(&device->vk, &instance->vk,
- NULL, /* We set up extensions later */
+ NULL, NULL, NULL, /* We set up extensions later */
&dispatch_table);
if (result != VK_SUCCESS) {
- vk_error(result);
+ vk_error(instance, result);
goto fail_alloc;
}
device->instance = instance;
@@ -814,175 +2348,156 @@ anv_physical_device_try_create(struct anv_instance *instance,
device->info = devinfo;
- device->pci_info.domain = drm_device->businfo.pci->domain;
- device->pci_info.bus = drm_device->businfo.pci->bus;
- device->pci_info.device = drm_device->businfo.pci->dev;
- device->pci_info.function = drm_device->businfo.pci->func;
-
- device->cmd_parser_version = -1;
- if (device->info.ver == 7) {
- device->cmd_parser_version =
- anv_gem_get_param(fd, I915_PARAM_CMD_PARSER_VERSION);
- if (device->cmd_parser_version == -1) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "failed to get command parser version");
- goto fail_base;
- }
- }
-
- if (!anv_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT)) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing gem wait");
+ device->local_fd = fd;
+ result = anv_physical_device_get_parameters(device);
+ if (result != VK_SUCCESS)
goto fail_base;
- }
- if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2)) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing execbuf2");
- goto fail_base;
- }
+ device->gtt_size = device->info.gtt_size ? device->info.gtt_size :
+ device->info.aperture_bytes;
- if (!device->info.has_llc &&
- anv_gem_get_param(fd, I915_PARAM_MMAP_VERSION) < 1) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing wc mmap");
+ if (device->gtt_size < (4ULL << 30 /* GiB */)) {
+ vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
+ "GTT size too small: 0x%016"PRIx64, device->gtt_size);
goto fail_base;
}
- if (device->info.ver >= 8 && !device->info.is_cherryview &&
- !anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN)) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing softpin");
- goto fail_alloc;
+ /* We currently only have the right bits for instructions in Gen12+. If the
+ * kernel ever starts supporting that feature on previous generations,
+    * we'll need to edit genxml before enabling it here.
+ */
+ device->has_protected_contexts = device->info.ver >= 12 &&
+ intel_gem_supports_protected_context(fd, device->info.kmd_type);
+
+ /* Just pick one; they're all the same */
+ device->has_astc_ldr =
+ isl_format_supports_sampling(&device->info,
+ ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16);
+ if (!device->has_astc_ldr &&
+ driQueryOptionb(&device->instance->dri_options, "vk_require_astc"))
+ device->emu_astc_ldr = true;
+ if (devinfo.ver == 9 && !intel_device_info_is_9lp(&devinfo)) {
+ device->flush_astc_ldr_void_extent_denorms =
+ device->has_astc_ldr && !device->emu_astc_ldr;
}
+ device->disable_fcv = device->info.verx10 >= 125 ||
+ instance->disable_fcv;
- if (!anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE_ARRAY)) {
- result = vk_errorfi(device->instance, NULL,
- VK_ERROR_INITIALIZATION_FAILED,
- "kernel missing syncobj support");
+ result = anv_physical_device_init_heaps(device, fd);
+ if (result != VK_SUCCESS)
goto fail_base;
- }
- device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC);
- device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE);
- device->has_exec_fence = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE);
- device->has_syncobj_wait = anv_gem_supports_syncobj_wait(fd);
- device->has_syncobj_wait_available =
- anv_gem_get_drm_cap(fd, DRM_CAP_SYNCOBJ_TIMELINE) != 0;
+ if (debug_get_bool_option("ANV_QUEUE_THREAD_DISABLE", false))
+ device->has_exec_timeline = false;
- device->has_context_priority = anv_gem_has_context_priority(fd);
+ device->has_cooperative_matrix =
+ device->info.cooperative_matrix_configurations[0].scope != INTEL_CMAT_SCOPE_NONE;
- /* Initialize memory regions struct to 0. */
- memset(&device->vram, 0, sizeof(device->vram));
- memset(&device->sys, 0, sizeof(device->sys));
+ unsigned st_idx = 0;
- result = anv_physical_device_init_heaps(device, fd);
- if (result != VK_SUCCESS)
- goto fail_base;
+ device->sync_syncobj_type = vk_drm_syncobj_get_type(fd);
+ if (!device->has_exec_timeline)
+ device->sync_syncobj_type.features &= ~VK_SYNC_FEATURE_TIMELINE;
+ device->sync_types[st_idx++] = &device->sync_syncobj_type;
- device->use_softpin = device->info.ver >= 8 &&
- !device->info.is_cherryview;
- assert(device->use_softpin == device->supports_48bit_addresses);
+ /* anv_bo_sync_type is only supported with i915 for now */
+ if (device->info.kmd_type == INTEL_KMD_TYPE_I915) {
+ if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT))
+ device->sync_types[st_idx++] = &anv_bo_sync_type;
- device->has_context_isolation =
- anv_gem_get_param(fd, I915_PARAM_HAS_CONTEXT_ISOLATION);
+ if (!(device->sync_syncobj_type.features & VK_SYNC_FEATURE_TIMELINE)) {
+ device->sync_timeline_type = vk_sync_timeline_get_type(&anv_bo_sync_type);
+ device->sync_types[st_idx++] = &device->sync_timeline_type.sync;
+ }
+ } else {
+ assert(vk_sync_type_is_drm_syncobj(&device->sync_syncobj_type));
+ assert(device->sync_syncobj_type.features & VK_SYNC_FEATURE_TIMELINE);
+ assert(device->sync_syncobj_type.features & VK_SYNC_FEATURE_CPU_WAIT);
+ }
- device->has_exec_timeline =
- anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_TIMELINE_FENCES);
- if (env_var_as_boolean("ANV_QUEUE_THREAD_DISABLE", false))
- device->has_exec_timeline = false;
+ device->sync_types[st_idx++] = NULL;
+ assert(st_idx <= ARRAY_SIZE(device->sync_types));
+ device->vk.supported_sync_types = device->sync_types;
- device->has_thread_submit =
- device->has_syncobj_wait_available && device->has_exec_timeline;
+ device->vk.pipeline_cache_import_ops = anv_cache_import_ops;
device->always_use_bindless =
- env_var_as_boolean("ANV_ALWAYS_BINDLESS", false);
+ debug_get_bool_option("ANV_ALWAYS_BINDLESS", false);
device->use_call_secondary =
- device->use_softpin &&
- !env_var_as_boolean("ANV_DISABLE_SECONDARY_CMD_BUFFER_CALLS", false);
+ !debug_get_bool_option("ANV_DISABLE_SECONDARY_CMD_BUFFER_CALLS", false);
- /* We first got the A64 messages on broadwell and we can only use them if
- * we can pass addresses directly into the shader which requires softpin.
- */
- device->has_a64_buffer_access = device->info.ver >= 8 &&
- device->use_softpin;
+ device->video_decode_enabled = debug_get_bool_option("ANV_VIDEO_DECODE", false);
- /* We first get bindless image access on Skylake.
- */
- device->has_bindless_images = device->info.ver >= 9;
+ device->uses_ex_bso = device->info.verx10 >= 125;
- /* We've had bindless samplers since Ivy Bridge (forever in Vulkan terms)
- * because it's just a matter of setting the sampler address in the sample
- * message header. However, we've not bothered to wire it up for vec4 so
- * we leave it disabled on gfx7.
+ /* For now always use indirect descriptors. We'll update this
+ * to !uses_ex_bso when all the infrastructure is built up.
*/
- device->has_bindless_samplers = device->info.ver >= 8;
-
- device->has_implicit_ccs = device->info.has_aux_map;
+ device->indirect_descriptors =
+ !device->uses_ex_bso ||
+ driQueryOptionb(&instance->dri_options, "force_indirect_descriptors");
+ device->alloc_aux_tt_mem =
+ device->info.has_aux_map && device->info.verx10 >= 125;
/* Check if we can read the GPU timestamp register from the CPU */
uint64_t u64_ignore;
- device->has_reg_timestamp = anv_gem_reg_read(fd, TIMESTAMP | I915_REG_READ_8B_WA,
- &u64_ignore) == 0;
+ device->has_reg_timestamp = intel_gem_read_render_timestamp(fd,
+ device->info.kmd_type,
+ &u64_ignore);
+
+ device->uses_relocs = device->info.kmd_type != INTEL_KMD_TYPE_XE;
+
+ /* While xe.ko can use both vm_bind and TR-TT, i915.ko only has TR-TT. */
+ if (device->info.kmd_type == INTEL_KMD_TYPE_XE) {
+ if (debug_get_bool_option("ANV_SPARSE_USE_TRTT", false))
+ device->sparse_type = ANV_SPARSE_TYPE_TRTT;
+ else
+ device->sparse_type = ANV_SPARSE_TYPE_VM_BIND;
+ } else {
+ if (device->info.ver >= 12 &&
+ device->has_exec_timeline &&
+ debug_get_bool_option("ANV_SPARSE", true)) {
+ device->sparse_type = ANV_SPARSE_TYPE_TRTT;
+ } else if (instance->has_fake_sparse) {
+ device->sparse_type = ANV_SPARSE_TYPE_FAKE;
+ } else {
+ device->sparse_type = ANV_SPARSE_TYPE_NOT_SUPPORTED;
+ }
+ }
- device->always_flush_cache =
+ device->always_flush_cache = INTEL_DEBUG(DEBUG_STALL) ||
driQueryOptionb(&instance->dri_options, "always_flush_cache");
- device->has_mmap_offset =
- anv_gem_get_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4;
-
- device->has_userptr_probe =
- anv_gem_get_param(fd, I915_PARAM_HAS_USERPTR_PROBE);
-
device->compiler = brw_compiler_create(NULL, &device->info);
if (device->compiler == NULL) {
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_base;
}
device->compiler->shader_debug_log = compiler_debug_log;
device->compiler->shader_perf_log = compiler_perf_log;
- device->compiler->supports_pull_constants = false;
- device->compiler->constant_buffer_0_is_relative =
- device->info.ver < 8 || !device->has_context_isolation;
- device->compiler->supports_shader_constants = true;
- device->compiler->compact_params = false;
device->compiler->indirect_ubos_use_sampler = device->info.ver < 12;
+ device->compiler->extended_bindless_surface_offset = device->uses_ex_bso;
+ device->compiler->use_bindless_sampler_offset = false;
+ device->compiler->spilling_rate =
+ driQueryOptioni(&instance->dri_options, "shader_spilling_rate");
- /* Broadwell PRM says:
- *
- * "Before Gfx8, there was a historical configuration control field to
- * swizzle address bit[6] for in X/Y tiling modes. This was set in three
- * different places: TILECTL[1:0], ARB_MODE[5:4], and
- * DISP_ARB_CTL[14:13].
- *
- * For Gfx8 and subsequent generations, the swizzle fields are all
- * reserved, and the CPU's memory controller performs all address
- * swizzling modifications."
- */
- bool swizzled =
- device->info.ver < 8 && anv_gem_get_bit6_swizzle(fd, I915_TILING_X);
-
- isl_device_init(&device->isl_dev, &device->info, swizzled);
+ isl_device_init(&device->isl_dev, &device->info);
+ device->isl_dev.buffer_length_in_aux_addr = true;
result = anv_physical_device_init_uuids(device);
if (result != VK_SUCCESS)
goto fail_compiler;
+ anv_physical_device_init_va_ranges(device);
+
anv_physical_device_init_disk_cache(device);
if (instance->vk.enabled_extensions.KHR_display) {
master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
if (master_fd >= 0) {
- /* prod the device with a GETPARAM call which will fail if
- * we don't have permission to even render on this device
- */
- if (anv_gem_get_param(master_fd, I915_PARAM_CHIPSET_ID) == 0) {
+ /* fail if we don't have permission to even render on this device */
+ if (!intel_gem_can_render_on_fd(master_fd, device->info.kmd_type)) {
close(master_fd);
master_fd = -1;
}
@@ -990,25 +2505,15 @@ anv_physical_device_try_create(struct anv_instance *instance,
}
device->master_fd = master_fd;
- device->engine_info = anv_gem_get_engine_info(fd);
+ device->engine_info = intel_engine_get_info(fd, device->info.kmd_type);
+ device->info.has_compute_engine = device->engine_info &&
+ intel_engines_count(device->engine_info,
+ INTEL_ENGINE_CLASS_COMPUTE);
anv_physical_device_init_queue_families(device);
- result = anv_init_wsi(device);
- if (result != VK_SUCCESS)
- goto fail_engine_info;
-
anv_physical_device_init_perf(device, fd);
- anv_measure_device_init(device);
-
- get_device_extensions(device, &device->vk.supported_extensions);
-
- device->local_fd = fd;
-
- anv_genX(&device->info, init_physical_device_state)(device);
-
- *device_out = device;
-
+ /* Gather major/minor before WSI. */
struct stat st;
if (stat(primary_path, &st) == 0) {
@@ -1031,9 +2536,24 @@ anv_physical_device_try_create(struct anv_instance *instance,
device->local_minor = 0;
}
+ get_device_extensions(device, &device->vk.supported_extensions);
+ get_features(device, &device->vk.supported_features);
+ get_properties(device, &device->vk.properties);
+
+ result = anv_init_wsi(device);
+ if (result != VK_SUCCESS)
+ goto fail_perf;
+
+ anv_measure_device_init(device);
+
+ anv_genX(&device->info, init_physical_device_state)(device);
+
+ *out = &device->vk;
+
return VK_SUCCESS;
-fail_engine_info:
+fail_perf:
+ ralloc_free(device->perf);
free(device->engine_info);
anv_physical_device_free_disk_cache(device);
fail_compiler:
@@ -1050,8 +2570,11 @@ fail_fd:
}
static void
-anv_physical_device_destroy(struct anv_physical_device *device)
+anv_physical_device_destroy(struct vk_physical_device *vk_device)
{
+ struct anv_physical_device *device =
+ container_of(vk_device, struct anv_physical_device, vk);
+
anv_finish_wsi(device);
anv_measure_device_destroy(device);
free(device->engine_info);
@@ -1071,7 +2594,7 @@ VkResult anv_EnumerateInstanceExtensionProperties(
VkExtensionProperties* pProperties)
{
if (pLayerName)
- return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+ return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
return vk_enumerate_instance_extension_properties(
&instance_extensions, pPropertyCount, pProperties);
@@ -1088,6 +2611,44 @@ anv_init_dri_options(struct anv_instance *instance)
instance->vk.app_info.app_version,
instance->vk.app_info.engine_name,
instance->vk.app_info.engine_version);
+
+ instance->assume_full_subgroups =
+ driQueryOptioni(&instance->dri_options, "anv_assume_full_subgroups");
+ instance->limit_trig_input_range =
+ driQueryOptionb(&instance->dri_options, "limit_trig_input_range");
+ instance->sample_mask_out_opengl_behaviour =
+ driQueryOptionb(&instance->dri_options, "anv_sample_mask_out_opengl_behaviour");
+ instance->force_filter_addr_rounding =
+ driQueryOptionb(&instance->dri_options, "anv_force_filter_addr_rounding");
+ instance->lower_depth_range_rate =
+ driQueryOptionf(&instance->dri_options, "lower_depth_range_rate");
+ instance->no_16bit =
+ driQueryOptionb(&instance->dri_options, "no_16bit");
+ instance->intel_enable_wa_14018912822 =
+ driQueryOptionb(&instance->dri_options, "intel_enable_wa_14018912822");
+ instance->mesh_conv_prim_attrs_to_vert_attrs =
+ driQueryOptioni(&instance->dri_options, "anv_mesh_conv_prim_attrs_to_vert_attrs");
+ instance->fp64_workaround_enabled =
+ driQueryOptionb(&instance->dri_options, "fp64_workaround_enabled");
+ instance->generated_indirect_threshold =
+ driQueryOptioni(&instance->dri_options, "generated_indirect_threshold");
+ instance->generated_indirect_ring_threshold =
+ driQueryOptioni(&instance->dri_options, "generated_indirect_ring_threshold");
+ instance->query_clear_with_blorp_threshold =
+ driQueryOptioni(&instance->dri_options, "query_clear_with_blorp_threshold");
+ instance->query_copy_with_shader_threshold =
+ driQueryOptioni(&instance->dri_options, "query_copy_with_shader_threshold");
+ instance->force_vk_vendor =
+ driQueryOptioni(&instance->dri_options, "force_vk_vendor");
+ instance->has_fake_sparse =
+ driQueryOptionb(&instance->dri_options, "fake_sparse");
+ instance->enable_tbimr = driQueryOptionb(&instance->dri_options, "intel_tbimr");
+ instance->disable_fcv =
+ driQueryOptionb(&instance->dri_options, "anv_disable_fcv");
+ instance->external_memory_implicit_sync =
+ driQueryOptionb(&instance->dri_options, "anv_external_memory_implicit_sync");
+ instance->compression_control_enabled =
+ driQueryOptionb(&instance->dri_options, "compression_control_enabled");
}
VkResult anv_CreateInstance(
@@ -1106,29 +2667,30 @@ VkResult anv_CreateInstance(
instance = vk_alloc(pAllocator, sizeof(*instance), 8,
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
if (!instance)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
struct vk_instance_dispatch_table dispatch_table;
vk_instance_dispatch_table_from_entrypoints(
&dispatch_table, &anv_instance_entrypoints, true);
+ vk_instance_dispatch_table_from_entrypoints(
+ &dispatch_table, &wsi_instance_entrypoints, false);
result = vk_instance_init(&instance->vk, &instance_extensions,
&dispatch_table, pCreateInfo, pAllocator);
if (result != VK_SUCCESS) {
vk_free(pAllocator, instance);
- return vk_error(result);
+ return vk_error(NULL, result);
}
- instance->physical_devices_enumerated = false;
- list_inithead(&instance->physical_devices);
-
- instance->pipeline_cache_enabled =
- env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true);
+ instance->vk.physical_devices.try_create_for_drm = anv_physical_device_try_create;
+ instance->vk.physical_devices.destroy = anv_physical_device_destroy;
VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
anv_init_dri_options(instance);
+ intel_driver_ds_init();
+
*pInstance = anv_instance_to_handle(instance);
return VK_SUCCESS;
@@ -1143,10 +2705,6 @@ void anv_DestroyInstance(
if (!instance)
return;
- list_for_each_entry_safe(struct anv_physical_device, pdevice,
- &instance->physical_devices, link)
- anv_physical_device_destroy(pdevice);
-
VG(VALGRIND_DESTROY_MEMPOOL(instance));
driDestroyOptionCache(&instance->dri_options);
@@ -1156,1644 +2714,71 @@ void anv_DestroyInstance(
vk_free(&instance->vk.alloc, instance);
}
-static VkResult
-anv_enumerate_physical_devices(struct anv_instance *instance)
-{
- if (instance->physical_devices_enumerated)
- return VK_SUCCESS;
-
- instance->physical_devices_enumerated = true;
-
- /* TODO: Check for more devices ? */
- drmDevicePtr devices[8];
- int max_devices;
-
- max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
- if (max_devices < 1)
- return VK_SUCCESS;
-
- VkResult result = VK_SUCCESS;
- for (unsigned i = 0; i < (unsigned)max_devices; i++) {
- if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER &&
- devices[i]->bustype == DRM_BUS_PCI &&
- devices[i]->deviceinfo.pci->vendor_id == 0x8086) {
-
- struct anv_physical_device *pdevice;
- result = anv_physical_device_try_create(instance, devices[i],
- &pdevice);
- /* Incompatible DRM device, skip. */
- if (result == VK_ERROR_INCOMPATIBLE_DRIVER) {
- result = VK_SUCCESS;
- continue;
- }
-
- /* Error creating the physical device, report the error. */
- if (result != VK_SUCCESS)
- break;
-
- list_addtail(&pdevice->link, &instance->physical_devices);
- }
- }
- drmFreeDevices(devices, max_devices);
-
- /* If we successfully enumerated any devices, call it success */
- return result;
-}
-
-VkResult anv_EnumeratePhysicalDevices(
- VkInstance _instance,
- uint32_t* pPhysicalDeviceCount,
- VkPhysicalDevice* pPhysicalDevices)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- VK_OUTARRAY_MAKE(out, pPhysicalDevices, pPhysicalDeviceCount);
-
- VkResult result = anv_enumerate_physical_devices(instance);
- if (result != VK_SUCCESS)
- return result;
-
- list_for_each_entry(struct anv_physical_device, pdevice,
- &instance->physical_devices, link) {
- vk_outarray_append(&out, i) {
- *i = anv_physical_device_to_handle(pdevice);
- }
- }
-
- return vk_outarray_status(&out);
-}
-
-VkResult anv_EnumeratePhysicalDeviceGroups(
- VkInstance _instance,
- uint32_t* pPhysicalDeviceGroupCount,
- VkPhysicalDeviceGroupProperties* pPhysicalDeviceGroupProperties)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- VK_OUTARRAY_MAKE(out, pPhysicalDeviceGroupProperties,
- pPhysicalDeviceGroupCount);
-
- VkResult result = anv_enumerate_physical_devices(instance);
- if (result != VK_SUCCESS)
- return result;
-
- list_for_each_entry(struct anv_physical_device, pdevice,
- &instance->physical_devices, link) {
- vk_outarray_append(&out, p) {
- p->physicalDeviceCount = 1;
- memset(p->physicalDevices, 0, sizeof(p->physicalDevices));
- p->physicalDevices[0] = anv_physical_device_to_handle(pdevice);
- p->subsetAllocation = false;
-
- vk_foreach_struct(ext, p->pNext)
- anv_debug_ignored_stype(ext->sType);
- }
- }
-
- return vk_outarray_status(&out);
-}
-
-void anv_GetPhysicalDeviceFeatures(
- VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceFeatures* pFeatures)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- *pFeatures = (VkPhysicalDeviceFeatures) {
- .robustBufferAccess = true,
- .fullDrawIndexUint32 = true,
- .imageCubeArray = true,
- .independentBlend = true,
- .geometryShader = true,
- .tessellationShader = true,
- .sampleRateShading = true,
- .dualSrcBlend = true,
- .logicOp = true,
- .multiDrawIndirect = true,
- .drawIndirectFirstInstance = true,
- .depthClamp = true,
- .depthBiasClamp = true,
- .fillModeNonSolid = true,
- .depthBounds = pdevice->info.ver >= 12,
- .wideLines = true,
- .largePoints = true,
- .alphaToOne = true,
- .multiViewport = true,
- .samplerAnisotropy = true,
- .textureCompressionETC2 = pdevice->info.ver >= 8 ||
- pdevice->info.is_baytrail,
- .textureCompressionASTC_LDR = pdevice->info.ver >= 9, /* FINISHME CHV */
- .textureCompressionBC = true,
- .occlusionQueryPrecise = true,
- .pipelineStatisticsQuery = true,
- .fragmentStoresAndAtomics = true,
- .shaderTessellationAndGeometryPointSize = true,
- .shaderImageGatherExtended = true,
- .shaderStorageImageExtendedFormats = true,
- .shaderStorageImageMultisample = false,
- .shaderStorageImageReadWithoutFormat = false,
- .shaderStorageImageWriteWithoutFormat = true,
- .shaderUniformBufferArrayDynamicIndexing = true,
- .shaderSampledImageArrayDynamicIndexing = true,
- .shaderStorageBufferArrayDynamicIndexing = true,
- .shaderStorageImageArrayDynamicIndexing = true,
- .shaderClipDistance = true,
- .shaderCullDistance = true,
- .shaderFloat64 = pdevice->info.ver >= 8 &&
- pdevice->info.has_64bit_float,
- .shaderInt64 = pdevice->info.ver >= 8,
- .shaderInt16 = pdevice->info.ver >= 8,
- .shaderResourceMinLod = pdevice->info.ver >= 9,
- .variableMultisampleRate = true,
- .inheritedQueries = true,
- };
-
- /* We can't do image stores in vec4 shaders */
- pFeatures->vertexPipelineStoresAndAtomics =
- pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] &&
- pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY];
-
- struct vk_app_info *app_info = &pdevice->instance->vk.app_info;
-
- /* The new DOOM and Wolfenstein games require depthBounds without
- * checking for it. They seem to run fine without it so just claim it's
- * there and accept the consequences.
- */
- if (app_info->engine_name && strcmp(app_info->engine_name, "idTech") == 0)
- pFeatures->depthBounds = true;
-}
-
-static void
-anv_get_physical_device_features_1_1(struct anv_physical_device *pdevice,
- VkPhysicalDeviceVulkan11Features *f)
-{
- assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES);
-
- f->storageBuffer16BitAccess = pdevice->info.ver >= 8;
- f->uniformAndStorageBuffer16BitAccess = pdevice->info.ver >= 8;
- f->storagePushConstant16 = pdevice->info.ver >= 8;
- f->storageInputOutput16 = false;
- f->multiview = true;
- f->multiviewGeometryShader = true;
- f->multiviewTessellationShader = true;
- f->variablePointersStorageBuffer = true;
- f->variablePointers = true;
- f->protectedMemory = false;
- f->samplerYcbcrConversion = true;
- f->shaderDrawParameters = true;
-}
-
-static void
-anv_get_physical_device_features_1_2(struct anv_physical_device *pdevice,
- VkPhysicalDeviceVulkan12Features *f)
-{
- assert(f->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES);
-
- f->samplerMirrorClampToEdge = true;
- f->drawIndirectCount = true;
- f->storageBuffer8BitAccess = pdevice->info.ver >= 8;
- f->uniformAndStorageBuffer8BitAccess = pdevice->info.ver >= 8;
- f->storagePushConstant8 = pdevice->info.ver >= 8;
- f->shaderBufferInt64Atomics = pdevice->info.ver >= 9 &&
- pdevice->use_softpin;
- f->shaderSharedInt64Atomics = false;
- f->shaderFloat16 = pdevice->info.ver >= 8;
- f->shaderInt8 = pdevice->info.ver >= 8;
-
- bool descIndexing = pdevice->has_a64_buffer_access &&
- pdevice->has_bindless_images;
- f->descriptorIndexing = descIndexing;
- f->shaderInputAttachmentArrayDynamicIndexing = false;
- f->shaderUniformTexelBufferArrayDynamicIndexing = descIndexing;
- f->shaderStorageTexelBufferArrayDynamicIndexing = descIndexing;
- f->shaderUniformBufferArrayNonUniformIndexing = false;
- f->shaderSampledImageArrayNonUniformIndexing = descIndexing;
- f->shaderStorageBufferArrayNonUniformIndexing = descIndexing;
- f->shaderStorageImageArrayNonUniformIndexing = descIndexing;
- f->shaderInputAttachmentArrayNonUniformIndexing = false;
- f->shaderUniformTexelBufferArrayNonUniformIndexing = descIndexing;
- f->shaderStorageTexelBufferArrayNonUniformIndexing = descIndexing;
- f->descriptorBindingUniformBufferUpdateAfterBind = false;
- f->descriptorBindingSampledImageUpdateAfterBind = descIndexing;
- f->descriptorBindingStorageImageUpdateAfterBind = descIndexing;
- f->descriptorBindingStorageBufferUpdateAfterBind = descIndexing;
- f->descriptorBindingUniformTexelBufferUpdateAfterBind = descIndexing;
- f->descriptorBindingStorageTexelBufferUpdateAfterBind = descIndexing;
- f->descriptorBindingUpdateUnusedWhilePending = descIndexing;
- f->descriptorBindingPartiallyBound = descIndexing;
- f->descriptorBindingVariableDescriptorCount = descIndexing;
- f->runtimeDescriptorArray = descIndexing;
-
- f->samplerFilterMinmax = pdevice->info.ver >= 9;
- f->scalarBlockLayout = true;
- f->imagelessFramebuffer = true;
- f->uniformBufferStandardLayout = true;
- f->shaderSubgroupExtendedTypes = true;
- f->separateDepthStencilLayouts = true;
- f->hostQueryReset = true;
- f->timelineSemaphore = true;
- f->bufferDeviceAddress = pdevice->has_a64_buffer_access;
- f->bufferDeviceAddressCaptureReplay = pdevice->has_a64_buffer_access;
- f->bufferDeviceAddressMultiDevice = false;
- f->vulkanMemoryModel = true;
- f->vulkanMemoryModelDeviceScope = true;
- f->vulkanMemoryModelAvailabilityVisibilityChains = true;
- f->shaderOutputViewportIndex = true;
- f->shaderOutputLayer = true;
- f->subgroupBroadcastDynamicId = true;
-}
-
-void anv_GetPhysicalDeviceFeatures2(
- VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceFeatures2* pFeatures)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
- anv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
-
- VkPhysicalDeviceVulkan11Features core_1_1 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES,
- };
- anv_get_physical_device_features_1_1(pdevice, &core_1_1);
-
- VkPhysicalDeviceVulkan12Features core_1_2 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,
- };
- anv_get_physical_device_features_1_2(pdevice, &core_1_2);
-
-#define CORE_FEATURE(major, minor, feature) \
- features->feature = core_##major##_##minor.feature
-
-
- vk_foreach_struct(ext, pFeatures->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_4444_FORMATS_FEATURES_EXT: {
- VkPhysicalDevice4444FormatsFeaturesEXT *features =
- (VkPhysicalDevice4444FormatsFeaturesEXT *)ext;
- features->formatA4R4G4B4 = true;
- features->formatA4B4G4R4 = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: {
- VkPhysicalDevice8BitStorageFeaturesKHR *features =
- (VkPhysicalDevice8BitStorageFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, storageBuffer8BitAccess);
- CORE_FEATURE(1, 2, uniformAndStorageBuffer8BitAccess);
- CORE_FEATURE(1, 2, storagePushConstant8);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: {
- VkPhysicalDevice16BitStorageFeatures *features =
- (VkPhysicalDevice16BitStorageFeatures *)ext;
- CORE_FEATURE(1, 1, storageBuffer16BitAccess);
- CORE_FEATURE(1, 1, uniformAndStorageBuffer16BitAccess);
- CORE_FEATURE(1, 1, storagePushConstant16);
- CORE_FEATURE(1, 1, storageInputOutput16);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_FEATURES_KHR: {
- VkPhysicalDeviceAccelerationStructureFeaturesKHR *features = (void *)ext;
- features->accelerationStructure = false;
- features->accelerationStructureCaptureReplay = false;
- features->accelerationStructureIndirectBuild = false;
- features->accelerationStructureHostCommands = false;
- features->descriptorBindingAccelerationStructureUpdateAfterBind = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_EXT: {
- VkPhysicalDeviceBufferDeviceAddressFeaturesEXT *features = (void *)ext;
- features->bufferDeviceAddress = pdevice->has_a64_buffer_access;
- features->bufferDeviceAddressCaptureReplay = false;
- features->bufferDeviceAddressMultiDevice = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR: {
- VkPhysicalDeviceBufferDeviceAddressFeaturesKHR *features = (void *)ext;
- CORE_FEATURE(1, 2, bufferDeviceAddress);
- CORE_FEATURE(1, 2, bufferDeviceAddressCaptureReplay);
- CORE_FEATURE(1, 2, bufferDeviceAddressMultiDevice);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COLOR_WRITE_ENABLE_FEATURES_EXT: {
- VkPhysicalDeviceColorWriteEnableFeaturesEXT *features =
- (VkPhysicalDeviceColorWriteEnableFeaturesEXT *)ext;
- features->colorWriteEnable = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV: {
- VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *features =
- (VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *)ext;
- features->computeDerivativeGroupQuads = true;
- features->computeDerivativeGroupLinear = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
- VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
- (VkPhysicalDeviceConditionalRenderingFeaturesEXT*)ext;
- features->conditionalRendering = pdevice->info.verx10 >= 75;
- features->inheritedConditionalRendering = pdevice->info.verx10 >= 75;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_FEATURES_EXT: {
- VkPhysicalDeviceCustomBorderColorFeaturesEXT *features =
- (VkPhysicalDeviceCustomBorderColorFeaturesEXT *)ext;
- features->customBorderColors = pdevice->info.ver >= 8;
- features->customBorderColorWithoutFormat = pdevice->info.ver >= 8;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_ENABLE_FEATURES_EXT: {
- VkPhysicalDeviceDepthClipEnableFeaturesEXT *features =
- (VkPhysicalDeviceDepthClipEnableFeaturesEXT *)ext;
- features->depthClipEnable = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR: {
- VkPhysicalDeviceFloat16Int8FeaturesKHR *features = (void *)ext;
- CORE_FEATURE(1, 2, shaderFloat16);
- CORE_FEATURE(1, 2, shaderInt8);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADER_INTERLOCK_FEATURES_EXT: {
- VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT *features =
- (VkPhysicalDeviceFragmentShaderInterlockFeaturesEXT *)ext;
- features->fragmentShaderSampleInterlock = pdevice->info.ver >= 9;
- features->fragmentShaderPixelInterlock = pdevice->info.ver >= 9;
- features->fragmentShaderShadingRateInterlock = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_HOST_QUERY_RESET_FEATURES_EXT: {
- VkPhysicalDeviceHostQueryResetFeaturesEXT *features =
- (VkPhysicalDeviceHostQueryResetFeaturesEXT *)ext;
- CORE_FEATURE(1, 2, hostQueryReset);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES_EXT: {
- VkPhysicalDeviceDescriptorIndexingFeaturesEXT *features =
- (VkPhysicalDeviceDescriptorIndexingFeaturesEXT *)ext;
- CORE_FEATURE(1, 2, shaderInputAttachmentArrayDynamicIndexing);
- CORE_FEATURE(1, 2, shaderUniformTexelBufferArrayDynamicIndexing);
- CORE_FEATURE(1, 2, shaderStorageTexelBufferArrayDynamicIndexing);
- CORE_FEATURE(1, 2, shaderUniformBufferArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderSampledImageArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderStorageBufferArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderStorageImageArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderInputAttachmentArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderUniformTexelBufferArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, shaderStorageTexelBufferArrayNonUniformIndexing);
- CORE_FEATURE(1, 2, descriptorBindingUniformBufferUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingSampledImageUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingStorageImageUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingStorageBufferUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingUniformTexelBufferUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingStorageTexelBufferUpdateAfterBind);
- CORE_FEATURE(1, 2, descriptorBindingUpdateUnusedWhilePending);
- CORE_FEATURE(1, 2, descriptorBindingPartiallyBound);
- CORE_FEATURE(1, 2, descriptorBindingVariableDescriptorCount);
- CORE_FEATURE(1, 2, runtimeDescriptorArray);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_FEATURES_KHR: {
- VkPhysicalDeviceFragmentShadingRateFeaturesKHR *features =
- (VkPhysicalDeviceFragmentShadingRateFeaturesKHR *)ext;
- features->attachmentFragmentShadingRate = false;
- features->pipelineFragmentShadingRate = true;
- features->primitiveFragmentShadingRate = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_ROBUSTNESS_FEATURES_EXT: {
- VkPhysicalDeviceImageRobustnessFeaturesEXT *features =
- (VkPhysicalDeviceImageRobustnessFeaturesEXT *)ext;
- features->robustImageAccess = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INDEX_TYPE_UINT8_FEATURES_EXT: {
- VkPhysicalDeviceIndexTypeUint8FeaturesEXT *features =
- (VkPhysicalDeviceIndexTypeUint8FeaturesEXT *)ext;
- features->indexTypeUint8 = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: {
- VkPhysicalDeviceInlineUniformBlockFeaturesEXT *features =
- (VkPhysicalDeviceInlineUniformBlockFeaturesEXT *)ext;
- features->inlineUniformBlock = true;
- features->descriptorBindingInlineUniformBlockUpdateAfterBind = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_FEATURES_EXT: {
- VkPhysicalDeviceLineRasterizationFeaturesEXT *features =
- (VkPhysicalDeviceLineRasterizationFeaturesEXT *)ext;
- features->rectangularLines = true;
- features->bresenhamLines = true;
- /* Support for Smooth lines with MSAA was removed on gfx11. From the
- * BSpec section "Multisample ModesState" table for "AA Line Support
- * Requirements":
- *
- * GFX10:BUG:######## NUM_MULTISAMPLES == 1
- *
- * Fortunately, this isn't a case most people care about.
- */
- features->smoothLines = pdevice->info.ver < 10;
- features->stippledRectangularLines = false;
- features->stippledBresenhamLines = true;
- features->stippledSmoothLines = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES: {
- VkPhysicalDeviceMultiviewFeatures *features =
- (VkPhysicalDeviceMultiviewFeatures *)ext;
- CORE_FEATURE(1, 1, multiview);
- CORE_FEATURE(1, 1, multiviewGeometryShader);
- CORE_FEATURE(1, 1, multiviewTessellationShader);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGELESS_FRAMEBUFFER_FEATURES_KHR: {
- VkPhysicalDeviceImagelessFramebufferFeaturesKHR *features =
- (VkPhysicalDeviceImagelessFramebufferFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, imagelessFramebuffer);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
- VkPhysicalDevicePerformanceQueryFeaturesKHR *feature =
- (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext;
- feature->performanceCounterQueryPools = true;
- /* HW only supports a single configuration at a time. */
- feature->performanceCounterMultipleQueryPools = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_CREATION_CACHE_CONTROL_FEATURES_EXT: {
- VkPhysicalDevicePipelineCreationCacheControlFeaturesEXT *features =
- (VkPhysicalDevicePipelineCreationCacheControlFeaturesEXT *)ext;
- features->pipelineCreationCacheControl = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: {
- VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features =
- (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext;
- features->pipelineExecutableInfo = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRIVATE_DATA_FEATURES_EXT: {
- VkPhysicalDevicePrivateDataFeaturesEXT *features = (void *)ext;
- features->privateData = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_FEATURES: {
- VkPhysicalDeviceProtectedMemoryFeatures *features = (void *)ext;
- CORE_FEATURE(1, 1, protectedMemory);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_FEATURES_EXT: {
- VkPhysicalDeviceProvokingVertexFeaturesEXT *features =
- (VkPhysicalDeviceProvokingVertexFeaturesEXT *)ext;
- features->provokingVertexLast = true;
- features->transformFeedbackPreservesProvokingVertex = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT: {
- VkPhysicalDeviceRobustness2FeaturesEXT *features = (void *)ext;
- features->robustBufferAccess2 = true;
- features->robustImageAccess2 = true;
- features->nullDescriptor = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: {
- VkPhysicalDeviceSamplerYcbcrConversionFeatures *features =
- (VkPhysicalDeviceSamplerYcbcrConversionFeatures *) ext;
- CORE_FEATURE(1, 1, samplerYcbcrConversion);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SCALAR_BLOCK_LAYOUT_FEATURES_EXT: {
- VkPhysicalDeviceScalarBlockLayoutFeaturesEXT *features =
- (VkPhysicalDeviceScalarBlockLayoutFeaturesEXT *)ext;
- CORE_FEATURE(1, 2, scalarBlockLayout);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SEPARATE_DEPTH_STENCIL_LAYOUTS_FEATURES_KHR: {
- VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *features =
- (VkPhysicalDeviceSeparateDepthStencilLayoutsFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, separateDepthStencilLayouts);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_FEATURES_EXT: {
- VkPhysicalDeviceShaderAtomicFloatFeaturesEXT *features = (void *)ext;
- features->shaderBufferFloat32Atomics = true;
- features->shaderBufferFloat32AtomicAdd = pdevice->info.has_lsc;
- features->shaderBufferFloat64Atomics = pdevice->info.has_lsc;
- features->shaderBufferFloat64AtomicAdd = false;
- features->shaderSharedFloat32Atomics = true;
- features->shaderSharedFloat32AtomicAdd = false;
- features->shaderSharedFloat64Atomics = false;
- features->shaderSharedFloat64AtomicAdd = false;
- features->shaderImageFloat32Atomics = true;
- features->shaderImageFloat32AtomicAdd = false;
- features->sparseImageFloat32Atomics = false;
- features->sparseImageFloat32AtomicAdd = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_2_FEATURES_EXT: {
- VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT *features = (void *)ext;
- features->shaderBufferFloat16Atomics = false;
- features->shaderBufferFloat16AtomicAdd = false;
- features->shaderBufferFloat16AtomicMinMax = false;
- features->shaderBufferFloat32AtomicMinMax = pdevice->info.ver >= 9;
- features->shaderBufferFloat64AtomicMinMax = pdevice->info.has_lsc;
- features->shaderSharedFloat16Atomics = false;
- features->shaderSharedFloat16AtomicAdd = false;
- features->shaderSharedFloat16AtomicMinMax = false;
- features->shaderSharedFloat32AtomicMinMax = pdevice->info.ver >= 9;
- features->shaderSharedFloat64AtomicMinMax = false;
- features->shaderImageFloat32AtomicMinMax = false;
- features->sparseImageFloat32AtomicMinMax = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_INT64_FEATURES_KHR: {
- VkPhysicalDeviceShaderAtomicInt64FeaturesKHR *features = (void *)ext;
- CORE_FEATURE(1, 2, shaderBufferInt64Atomics);
- CORE_FEATURE(1, 2, shaderSharedInt64Atomics);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DEMOTE_TO_HELPER_INVOCATION_FEATURES_EXT: {
- VkPhysicalDeviceShaderDemoteToHelperInvocationFeaturesEXT *features = (void *)ext;
- features->shaderDemoteToHelperInvocation = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CLOCK_FEATURES_KHR: {
- VkPhysicalDeviceShaderClockFeaturesKHR *features =
- (VkPhysicalDeviceShaderClockFeaturesKHR *)ext;
- features->shaderSubgroupClock = true;
- features->shaderDeviceClock = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES: {
- VkPhysicalDeviceShaderDrawParametersFeatures *features = (void *)ext;
- CORE_FEATURE(1, 1, shaderDrawParameters);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_FUNCTIONS_2_FEATURES_INTEL: {
- VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL *features =
- (VkPhysicalDeviceShaderIntegerFunctions2FeaturesINTEL *)ext;
- features->shaderIntegerFunctions2 = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR: {
- VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *features =
- (VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR *)ext;
- features->shaderIntegerDotProduct = true;
- break;
- };
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_EXTENDED_TYPES_FEATURES_KHR: {
- VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR *features =
- (VkPhysicalDeviceShaderSubgroupExtendedTypesFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, shaderSubgroupExtendedTypes);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_UNIFORM_CONTROL_FLOW_FEATURES_KHR: {
- VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR *features =
- (VkPhysicalDeviceShaderSubgroupUniformControlFlowFeaturesKHR *)ext;
- features->shaderSubgroupUniformControlFlow = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_TERMINATE_INVOCATION_FEATURES_KHR: {
- VkPhysicalDeviceShaderTerminateInvocationFeaturesKHR *features =
- (VkPhysicalDeviceShaderTerminateInvocationFeaturesKHR *)ext;
- features->shaderTerminateInvocation = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT: {
- VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *features =
- (VkPhysicalDeviceSubgroupSizeControlFeaturesEXT *)ext;
- features->subgroupSizeControl = true;
- features->computeFullSubgroups = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_FEATURES_EXT: {
- VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *features =
- (VkPhysicalDeviceTexelBufferAlignmentFeaturesEXT *)ext;
- features->texelBufferAlignment = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR: {
- VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *features =
- (VkPhysicalDeviceTimelineSemaphoreFeaturesKHR *) ext;
- CORE_FEATURE(1, 2, timelineSemaphore);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES: {
- VkPhysicalDeviceVariablePointersFeatures *features = (void *)ext;
- CORE_FEATURE(1, 1, variablePointersStorageBuffer);
- CORE_FEATURE(1, 1, variablePointers);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT: {
- VkPhysicalDeviceTransformFeedbackFeaturesEXT *features =
- (VkPhysicalDeviceTransformFeedbackFeaturesEXT *)ext;
- features->transformFeedback = true;
- features->geometryStreams = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_UNIFORM_BUFFER_STANDARD_LAYOUT_FEATURES_KHR: {
- VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *features =
- (VkPhysicalDeviceUniformBufferStandardLayoutFeaturesKHR *)ext;
- CORE_FEATURE(1, 2, uniformBufferStandardLayout);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: {
- VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features =
- (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext;
- features->vertexAttributeInstanceRateDivisor = true;
- features->vertexAttributeInstanceRateZeroDivisor = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES:
- anv_get_physical_device_features_1_1(pdevice, (void *)ext);
- break;
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES:
- anv_get_physical_device_features_1_2(pdevice, (void *)ext);
- break;
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR: {
- VkPhysicalDeviceVulkanMemoryModelFeaturesKHR *features = (void *)ext;
- CORE_FEATURE(1, 2, vulkanMemoryModel);
- CORE_FEATURE(1, 2, vulkanMemoryModelDeviceScope);
- CORE_FEATURE(1, 2, vulkanMemoryModelAvailabilityVisibilityChains);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_FEATURES_KHR: {
- VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR *features =
- (VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR *)ext;
- features->workgroupMemoryExplicitLayout = true;
- features->workgroupMemoryExplicitLayoutScalarBlockLayout = true;
- features->workgroupMemoryExplicitLayout8BitAccess = true;
- features->workgroupMemoryExplicitLayout16BitAccess = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_YCBCR_IMAGE_ARRAYS_FEATURES_EXT: {
- VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *features =
- (VkPhysicalDeviceYcbcrImageArraysFeaturesEXT *)ext;
- features->ycbcrImageArrays = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_FEATURES_EXT: {
- VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *features =
- (VkPhysicalDeviceExtendedDynamicStateFeaturesEXT *)ext;
- features->extendedDynamicState = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_DYNAMIC_STATE_2_FEATURES_EXT: {
- VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *features =
- (VkPhysicalDeviceExtendedDynamicState2FeaturesEXT *)ext;
- features->extendedDynamicState2 = true;
- features->extendedDynamicState2LogicOp = true;
- features->extendedDynamicState2PatchControlPoints = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ZERO_INITIALIZE_WORKGROUP_MEMORY_FEATURES_KHR: {
- VkPhysicalDeviceZeroInitializeWorkgroupMemoryFeaturesKHR *features =
- (VkPhysicalDeviceZeroInitializeWorkgroupMemoryFeaturesKHR *)ext;
- features->shaderZeroInitializeWorkgroupMemory = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_FEATURES_EXT: {
- VkPhysicalDeviceMultiDrawFeaturesEXT *features = (VkPhysicalDeviceMultiDrawFeaturesEXT *)ext;
- features->multiDraw = true;
- break;
- }
-
- default:
- anv_debug_ignored_stype(ext->sType);
- break;
- }
- }
-
-#undef CORE_FEATURE
-}
-
-#define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS 64
-
-#define MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS 64
-#define MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS 256
-
-#define MAX_CUSTOM_BORDER_COLORS 4096
-
-void anv_GetPhysicalDeviceProperties(
- VkPhysicalDevice physicalDevice,
- VkPhysicalDeviceProperties* pProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
- const struct intel_device_info *devinfo = &pdevice->info;
-
- /* See assertions made when programming the buffer surface state. */
- const uint32_t max_raw_buffer_sz = devinfo->ver >= 7 ?
- (1ul << 30) : (1ul << 27);
-
- const uint32_t max_ssbos = pdevice->has_a64_buffer_access ? UINT16_MAX : 64;
- const uint32_t max_textures =
- pdevice->has_bindless_images ? UINT16_MAX : 128;
- const uint32_t max_samplers =
- pdevice->has_bindless_samplers ? UINT16_MAX :
- (devinfo->verx10 >= 75) ? 128 : 16;
- const uint32_t max_images =
- pdevice->has_bindless_images ? UINT16_MAX : MAX_IMAGES;
-
- /* If we can use bindless for everything, claim a high per-stage limit,
- * otherwise use the binding table size, minus the slots reserved for
- * render targets and one slot for the descriptor buffer. */
- const uint32_t max_per_stage =
- pdevice->has_bindless_images && pdevice->has_a64_buffer_access
- ? UINT32_MAX : MAX_BINDING_TABLE_SIZE - MAX_RTS - 1;
-
- const uint32_t max_workgroup_size = 32 * devinfo->max_cs_workgroup_threads;
-
- VkSampleCountFlags sample_counts =
- isl_device_get_sample_counts(&pdevice->isl_dev);
-
-
- VkPhysicalDeviceLimits limits = {
- .maxImageDimension1D = (1 << 14),
- .maxImageDimension2D = (1 << 14),
- .maxImageDimension3D = (1 << 11),
- .maxImageDimensionCube = (1 << 14),
- .maxImageArrayLayers = (1 << 11),
- .maxTexelBufferElements = 128 * 1024 * 1024,
- .maxUniformBufferRange = (1ul << 27),
- .maxStorageBufferRange = max_raw_buffer_sz,
- .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
- .maxMemoryAllocationCount = UINT32_MAX,
- .maxSamplerAllocationCount = 64 * 1024,
- .bufferImageGranularity = 64, /* A cache line */
- .sparseAddressSpaceSize = 0,
- .maxBoundDescriptorSets = MAX_SETS,
- .maxPerStageDescriptorSamplers = max_samplers,
- .maxPerStageDescriptorUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS,
- .maxPerStageDescriptorStorageBuffers = max_ssbos,
- .maxPerStageDescriptorSampledImages = max_textures,
- .maxPerStageDescriptorStorageImages = max_images,
- .maxPerStageDescriptorInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS,
- .maxPerStageResources = max_per_stage,
- .maxDescriptorSetSamplers = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
- .maxDescriptorSetUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS, /* number of stages * maxPerStageDescriptorUniformBuffers */
- .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
- .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */
- .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2,
- .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
- .maxDescriptorSetStorageImages = 6 * max_images, /* number of stages * maxPerStageDescriptorStorageImages */
- .maxDescriptorSetInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS,
- .maxVertexInputAttributes = MAX_VBS,
- .maxVertexInputBindings = MAX_VBS,
- .maxVertexInputAttributeOffset = 2047,
- .maxVertexInputBindingStride = 2048,
- .maxVertexOutputComponents = 128,
- .maxTessellationGenerationLevel = 64,
- .maxTessellationPatchSize = 32,
- .maxTessellationControlPerVertexInputComponents = 128,
- .maxTessellationControlPerVertexOutputComponents = 128,
- .maxTessellationControlPerPatchOutputComponents = 128,
- .maxTessellationControlTotalOutputComponents = 2048,
- .maxTessellationEvaluationInputComponents = 128,
- .maxTessellationEvaluationOutputComponents = 128,
- .maxGeometryShaderInvocations = 32,
- .maxGeometryInputComponents = devinfo->ver >= 8 ? 128 : 64,
- .maxGeometryOutputComponents = 128,
- .maxGeometryOutputVertices = 256,
- .maxGeometryTotalOutputComponents = 1024,
- .maxFragmentInputComponents = 116, /* 128 components - (PSIZ, CLIP_DIST0, CLIP_DIST1) */
- .maxFragmentOutputAttachments = 8,
- .maxFragmentDualSrcAttachments = 1,
- .maxFragmentCombinedOutputResources = 8,
- .maxComputeSharedMemorySize = 64 * 1024,
- .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
- .maxComputeWorkGroupInvocations = max_workgroup_size,
- .maxComputeWorkGroupSize = {
- max_workgroup_size,
- max_workgroup_size,
- max_workgroup_size,
- },
- .subPixelPrecisionBits = 8,
- .subTexelPrecisionBits = 8,
- .mipmapPrecisionBits = 8,
- .maxDrawIndexedIndexValue = UINT32_MAX,
- .maxDrawIndirectCount = UINT32_MAX,
- .maxSamplerLodBias = 16,
- .maxSamplerAnisotropy = 16,
- .maxViewports = MAX_VIEWPORTS,
- .maxViewportDimensions = { (1 << 14), (1 << 14) },
- .viewportBoundsRange = { INT16_MIN, INT16_MAX },
- .viewportSubPixelBits = 13, /* We take a float? */
- .minMemoryMapAlignment = 4096, /* A page */
- /* The dataport requires texel alignment so we need to assume a worst
- * case of R32G32B32A32 which is 16 bytes.
- */
- .minTexelBufferOffsetAlignment = 16,
- .minUniformBufferOffsetAlignment = ANV_UBO_ALIGNMENT,
- .minStorageBufferOffsetAlignment = ANV_SSBO_ALIGNMENT,
- .minTexelOffset = -8,
- .maxTexelOffset = 7,
- .minTexelGatherOffset = -32,
- .maxTexelGatherOffset = 31,
- .minInterpolationOffset = -0.5,
- .maxInterpolationOffset = 0.4375,
- .subPixelInterpolationOffsetBits = 4,
- .maxFramebufferWidth = (1 << 14),
- .maxFramebufferHeight = (1 << 14),
- .maxFramebufferLayers = (1 << 11),
- .framebufferColorSampleCounts = sample_counts,
- .framebufferDepthSampleCounts = sample_counts,
- .framebufferStencilSampleCounts = sample_counts,
- .framebufferNoAttachmentsSampleCounts = sample_counts,
- .maxColorAttachments = MAX_RTS,
- .sampledImageColorSampleCounts = sample_counts,
- .sampledImageIntegerSampleCounts = sample_counts,
- .sampledImageDepthSampleCounts = sample_counts,
- .sampledImageStencilSampleCounts = sample_counts,
- .storageImageSampleCounts = VK_SAMPLE_COUNT_1_BIT,
- .maxSampleMaskWords = 1,
- .timestampComputeAndGraphics = true,
- .timestampPeriod = 1000000000.0 / devinfo->timestamp_frequency,
- .maxClipDistances = 8,
- .maxCullDistances = 8,
- .maxCombinedClipAndCullDistances = 8,
- .discreteQueuePriorities = 2,
- .pointSizeRange = { 0.125, 255.875 },
- /* While SKL and up support much wider lines than we are setting here,
- * in practice we run into conformance issues if we go past this limit.
- * Since the Windows driver does the same, it's probably fair to assume
- * that no one needs more than this.
- */
- .lineWidthRange = { 0.0, 7.9921875 },
- .pointSizeGranularity = (1.0 / 8.0),
- .lineWidthGranularity = (1.0 / 128.0),
- .strictLines = false,
- .standardSampleLocations = true,
- .optimalBufferCopyOffsetAlignment = 128,
- .optimalBufferCopyRowPitchAlignment = 128,
- .nonCoherentAtomSize = 64,
- };
-
- *pProperties = (VkPhysicalDeviceProperties) {
- .apiVersion = ANV_API_VERSION,
- .driverVersion = vk_get_driver_version(),
- .vendorID = 0x8086,
- .deviceID = pdevice->info.chipset_id,
- .deviceType = pdevice->info.has_local_mem ?
- VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU :
- VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
- .limits = limits,
- .sparseProperties = {0}, /* Broadwell doesn't do sparse. */
- };
-
- snprintf(pProperties->deviceName, sizeof(pProperties->deviceName),
- "%s", pdevice->info.name);
- memcpy(pProperties->pipelineCacheUUID,
- pdevice->pipeline_cache_uuid, VK_UUID_SIZE);
-}
-
-static void
-anv_get_physical_device_properties_1_1(struct anv_physical_device *pdevice,
- VkPhysicalDeviceVulkan11Properties *p)
-{
- assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES);
-
- memcpy(p->deviceUUID, pdevice->device_uuid, VK_UUID_SIZE);
- memcpy(p->driverUUID, pdevice->driver_uuid, VK_UUID_SIZE);
- memset(p->deviceLUID, 0, VK_LUID_SIZE);
- p->deviceNodeMask = 0;
- p->deviceLUIDValid = false;
-
- p->subgroupSize = BRW_SUBGROUP_SIZE;
- VkShaderStageFlags scalar_stages = 0;
- for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
- if (pdevice->compiler->scalar_stage[stage])
- scalar_stages |= mesa_to_vk_shader_stage(stage);
- }
- if (pdevice->vk.supported_extensions.KHR_ray_tracing_pipeline) {
- scalar_stages |= MESA_SHADER_RAYGEN |
- MESA_SHADER_ANY_HIT |
- MESA_SHADER_CLOSEST_HIT |
- MESA_SHADER_MISS |
- MESA_SHADER_INTERSECTION |
- MESA_SHADER_CALLABLE;
- }
- p->subgroupSupportedStages = scalar_stages;
- p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
- VK_SUBGROUP_FEATURE_VOTE_BIT |
- VK_SUBGROUP_FEATURE_BALLOT_BIT |
- VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
- VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT |
- VK_SUBGROUP_FEATURE_QUAD_BIT;
- if (pdevice->info.ver >= 8) {
- /* TODO: There's no technical reason why these can't be made to
-       * work on gfx7 but they don't at the moment, so it's better to leave
-       * the feature disabled than enabled and broken.
- */
- p->subgroupSupportedOperations |= VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
- VK_SUBGROUP_FEATURE_CLUSTERED_BIT;
- }
- p->subgroupQuadOperationsInAllStages = pdevice->info.ver >= 8;
-
- p->pointClippingBehavior = VK_POINT_CLIPPING_BEHAVIOR_USER_CLIP_PLANES_ONLY;
- p->maxMultiviewViewCount = 16;
- p->maxMultiviewInstanceIndex = UINT32_MAX / 16;
- p->protectedNoFault = false;
- /* This value doesn't matter for us today as our per-stage descriptors are
- * the real limit.
- */
- p->maxPerSetDescriptors = 1024;
- p->maxMemoryAllocationSize = MAX_MEMORY_ALLOCATION_SIZE;
-}
-
-static void
-anv_get_physical_device_properties_1_2(struct anv_physical_device *pdevice,
- VkPhysicalDeviceVulkan12Properties *p)
-{
- assert(p->sType == VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES);
-
- p->driverID = VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA_KHR;
- memset(p->driverName, 0, sizeof(p->driverName));
- snprintf(p->driverName, VK_MAX_DRIVER_NAME_SIZE_KHR,
- "Intel open-source Mesa driver");
- memset(p->driverInfo, 0, sizeof(p->driverInfo));
- snprintf(p->driverInfo, VK_MAX_DRIVER_INFO_SIZE_KHR,
- "Mesa " PACKAGE_VERSION MESA_GIT_SHA1);
- p->conformanceVersion = (VkConformanceVersionKHR) {
- .major = 1,
- .minor = 2,
- .subminor = 0,
- .patch = 0,
- };
-
- p->denormBehaviorIndependence =
- VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL_KHR;
- p->roundingModeIndependence =
- VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_NONE_KHR;
-
-   /* Broadwell does not support HF denorms and there are restrictions on
-    * other gens. According to Kabylake's PRM:
- *
- * "math - Extended Math Function
- * [...]
- * Restriction : Half-float denorms are always retained."
- */
- p->shaderDenormFlushToZeroFloat16 = false;
- p->shaderDenormPreserveFloat16 = pdevice->info.ver > 8;
- p->shaderRoundingModeRTEFloat16 = true;
- p->shaderRoundingModeRTZFloat16 = true;
- p->shaderSignedZeroInfNanPreserveFloat16 = true;
-
- p->shaderDenormFlushToZeroFloat32 = true;
- p->shaderDenormPreserveFloat32 = true;
- p->shaderRoundingModeRTEFloat32 = true;
- p->shaderRoundingModeRTZFloat32 = true;
- p->shaderSignedZeroInfNanPreserveFloat32 = true;
-
- p->shaderDenormFlushToZeroFloat64 = true;
- p->shaderDenormPreserveFloat64 = true;
- p->shaderRoundingModeRTEFloat64 = true;
- p->shaderRoundingModeRTZFloat64 = true;
- p->shaderSignedZeroInfNanPreserveFloat64 = true;
-
- /* It's a bit hard to exactly map our implementation to the limits
- * described by Vulkan. The bindless surface handle in the extended
- * message descriptors is 20 bits and it's an index into the table of
-    * RENDER_SURFACE_STATE structs that starts at the bindless surface base
-    * address. This means that we can have at most 1M surface states
- * allocated at any given time. Since most image views take two
- * descriptors, this means we have a limit of about 500K image views.
- *
-    * However, since we allocate surface states at vkCreateImageView time,
-    * our limit is actually something on the order of 500K image views
-    * allocated at any time. The actual limit described by Vulkan, on
- * the other hand, is a limit of how many you can have in a descriptor set.
- * Assuming anyone using 1M descriptors will be using the same image view
- * twice a bunch of times (or a bunch of null descriptors), we can safely
- * advertise a larger limit here.
- */
- const unsigned max_bindless_views = 1 << 20;
- p->maxUpdateAfterBindDescriptorsInAllPools = max_bindless_views;
- p->shaderUniformBufferArrayNonUniformIndexingNative = false;
- p->shaderSampledImageArrayNonUniformIndexingNative = false;
- p->shaderStorageBufferArrayNonUniformIndexingNative = true;
- p->shaderStorageImageArrayNonUniformIndexingNative = false;
- p->shaderInputAttachmentArrayNonUniformIndexingNative = false;
- p->robustBufferAccessUpdateAfterBind = true;
- p->quadDivergentImplicitLod = false;
- p->maxPerStageDescriptorUpdateAfterBindSamplers = max_bindless_views;
- p->maxPerStageDescriptorUpdateAfterBindUniformBuffers = MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
- p->maxPerStageDescriptorUpdateAfterBindStorageBuffers = UINT32_MAX;
- p->maxPerStageDescriptorUpdateAfterBindSampledImages = max_bindless_views;
- p->maxPerStageDescriptorUpdateAfterBindStorageImages = max_bindless_views;
- p->maxPerStageDescriptorUpdateAfterBindInputAttachments = MAX_PER_STAGE_DESCRIPTOR_INPUT_ATTACHMENTS;
- p->maxPerStageUpdateAfterBindResources = UINT32_MAX;
- p->maxDescriptorSetUpdateAfterBindSamplers = max_bindless_views;
- p->maxDescriptorSetUpdateAfterBindUniformBuffers = 6 * MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BUFFERS;
- p->maxDescriptorSetUpdateAfterBindUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
- p->maxDescriptorSetUpdateAfterBindStorageBuffers = UINT32_MAX;
- p->maxDescriptorSetUpdateAfterBindStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2;
- p->maxDescriptorSetUpdateAfterBindSampledImages = max_bindless_views;
- p->maxDescriptorSetUpdateAfterBindStorageImages = max_bindless_views;
- p->maxDescriptorSetUpdateAfterBindInputAttachments = MAX_DESCRIPTOR_SET_INPUT_ATTACHMENTS;
-
- /* We support all of the depth resolve modes */
- p->supportedDepthResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR |
- VK_RESOLVE_MODE_AVERAGE_BIT_KHR |
- VK_RESOLVE_MODE_MIN_BIT_KHR |
- VK_RESOLVE_MODE_MAX_BIT_KHR;
- /* Average doesn't make sense for stencil so we don't support that */
- p->supportedStencilResolveModes = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR;
- if (pdevice->info.ver >= 8) {
- /* The advanced stencil resolve modes currently require stencil
- * sampling be supported by the hardware.
- */
- p->supportedStencilResolveModes |= VK_RESOLVE_MODE_MIN_BIT_KHR |
- VK_RESOLVE_MODE_MAX_BIT_KHR;
- }
- p->independentResolveNone = true;
- p->independentResolve = true;
-
- p->filterMinmaxSingleComponentFormats = pdevice->info.ver >= 9;
- p->filterMinmaxImageComponentMapping = pdevice->info.ver >= 9;
-
- p->maxTimelineSemaphoreValueDifference = UINT64_MAX;
-
- p->framebufferIntegerColorSampleCounts =
- isl_device_get_sample_counts(&pdevice->isl_dev);
-}
-
void anv_GetPhysicalDeviceProperties2(
VkPhysicalDevice physicalDevice,
VkPhysicalDeviceProperties2* pProperties)
{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- anv_GetPhysicalDeviceProperties(physicalDevice, &pProperties->properties);
-
- VkPhysicalDeviceVulkan11Properties core_1_1 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES,
- };
- anv_get_physical_device_properties_1_1(pdevice, &core_1_1);
-
- VkPhysicalDeviceVulkan12Properties core_1_2 = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES,
- };
- anv_get_physical_device_properties_1_2(pdevice, &core_1_2);
-
-#define CORE_RENAMED_PROPERTY(major, minor, ext_property, core_property) \
- memcpy(&properties->ext_property, &core_##major##_##minor.core_property, \
- sizeof(core_##major##_##minor.core_property))
-
-#define CORE_PROPERTY(major, minor, property) \
- CORE_RENAMED_PROPERTY(major, minor, property, property)
+ vk_common_GetPhysicalDeviceProperties2(physicalDevice, pProperties);
+   /* Unfortunately the runtime doesn't handle the ANDROID extensions. */
vk_foreach_struct(ext, pProperties->pNext) {
switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ACCELERATION_STRUCTURE_PROPERTIES_KHR: {
- VkPhysicalDeviceAccelerationStructurePropertiesKHR *props = (void *)ext;
- props->maxGeometryCount = (1u << 24) - 1;
- props->maxInstanceCount = (1u << 24) - 1;
- props->maxPrimitiveCount = (1u << 29) - 1;
- props->maxPerStageDescriptorAccelerationStructures = UINT16_MAX;
- props->maxPerStageDescriptorUpdateAfterBindAccelerationStructures = UINT16_MAX;
- props->maxDescriptorSetAccelerationStructures = UINT16_MAX;
- props->maxDescriptorSetUpdateAfterBindAccelerationStructures = UINT16_MAX;
- props->minAccelerationStructureScratchOffsetAlignment = 64;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONSERVATIVE_RASTERIZATION_PROPERTIES_EXT: {
- /* TODO: Real limits */
- VkPhysicalDeviceConservativeRasterizationPropertiesEXT *properties =
- (VkPhysicalDeviceConservativeRasterizationPropertiesEXT *)ext;
- /* There's nothing in the public docs about this value as far as I
- * can tell. However, this is the value the Windows driver reports
- * and there's a comment on a rejected HW feature in the internal
- * docs that says:
- *
- * "This is similar to conservative rasterization, except the
- * primitive area is not extended by 1/512 and..."
- *
- * That's a bit of an obtuse reference but it's the best we've got
- * for now.
- */
- properties->primitiveOverestimationSize = 1.0f / 512.0f;
- properties->maxExtraPrimitiveOverestimationSize = 0.0f;
- properties->extraPrimitiveOverestimationSizeGranularity = 0.0f;
- properties->primitiveUnderestimation = false;
- properties->conservativePointAndLineRasterization = false;
- properties->degenerateTrianglesRasterized = true;
- properties->degenerateLinesRasterized = false;
- properties->fullyCoveredFragmentShaderInputVariable = false;
- properties->conservativeRasterizationPostDepthCoverage = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CUSTOM_BORDER_COLOR_PROPERTIES_EXT: {
- VkPhysicalDeviceCustomBorderColorPropertiesEXT *properties =
- (VkPhysicalDeviceCustomBorderColorPropertiesEXT *)ext;
- properties->maxCustomBorderColorSamplers = MAX_CUSTOM_BORDER_COLORS;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_STENCIL_RESOLVE_PROPERTIES_KHR: {
- VkPhysicalDeviceDepthStencilResolvePropertiesKHR *properties =
- (VkPhysicalDeviceDepthStencilResolvePropertiesKHR *)ext;
- CORE_PROPERTY(1, 2, supportedDepthResolveModes);
- CORE_PROPERTY(1, 2, supportedStencilResolveModes);
- CORE_PROPERTY(1, 2, independentResolveNone);
- CORE_PROPERTY(1, 2, independentResolve);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_PROPERTIES_EXT: {
- VkPhysicalDeviceDescriptorIndexingPropertiesEXT *properties =
- (VkPhysicalDeviceDescriptorIndexingPropertiesEXT *)ext;
- CORE_PROPERTY(1, 2, maxUpdateAfterBindDescriptorsInAllPools);
- CORE_PROPERTY(1, 2, shaderUniformBufferArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, shaderSampledImageArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, shaderStorageBufferArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, shaderStorageImageArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, shaderInputAttachmentArrayNonUniformIndexingNative);
- CORE_PROPERTY(1, 2, robustBufferAccessUpdateAfterBind);
- CORE_PROPERTY(1, 2, quadDivergentImplicitLod);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSamplers);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindUniformBuffers);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageBuffers);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindSampledImages);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindStorageImages);
- CORE_PROPERTY(1, 2, maxPerStageDescriptorUpdateAfterBindInputAttachments);
- CORE_PROPERTY(1, 2, maxPerStageUpdateAfterBindResources);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSamplers);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffers);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindUniformBuffersDynamic);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffers);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageBuffersDynamic);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindSampledImages);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindStorageImages);
- CORE_PROPERTY(1, 2, maxDescriptorSetUpdateAfterBindInputAttachments);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FRAGMENT_SHADING_RATE_PROPERTIES_KHR: {
- VkPhysicalDeviceFragmentShadingRatePropertiesKHR *props =
- (VkPhysicalDeviceFragmentShadingRatePropertiesKHR *)ext;
- /* Those must be 0 if attachmentFragmentShadingRate is not
- * supported.
- */
- props->minFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
- props->maxFragmentShadingRateAttachmentTexelSize = (VkExtent2D) { 0, 0 };
- props->maxFragmentShadingRateAttachmentTexelSizeAspectRatio = 0;
-
- props->primitiveFragmentShadingRateWithMultipleViewports = false;
- props->layeredShadingRateAttachments = false;
- props->fragmentShadingRateNonTrivialCombinerOps = false;
- props->maxFragmentSize = (VkExtent2D) { 4, 4 };
- props->maxFragmentSizeAspectRatio = 4;
- props->maxFragmentShadingRateCoverageSamples = 4 * 4 * 16;
- props->maxFragmentShadingRateRasterizationSamples = VK_SAMPLE_COUNT_16_BIT;
- props->fragmentShadingRateWithShaderDepthStencilWrites = false;
- props->fragmentShadingRateWithSampleMask = true;
- props->fragmentShadingRateWithShaderSampleMask = false;
- props->fragmentShadingRateWithConservativeRasterization = true;
- props->fragmentShadingRateWithFragmentShaderInterlock = true;
- props->fragmentShadingRateWithCustomSampleLocations = true;
- props->fragmentShadingRateStrictMultiplyCombiner = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR: {
- VkPhysicalDeviceDriverPropertiesKHR *properties =
- (VkPhysicalDeviceDriverPropertiesKHR *) ext;
- CORE_PROPERTY(1, 2, driverID);
- CORE_PROPERTY(1, 2, driverName);
- CORE_PROPERTY(1, 2, driverInfo);
- CORE_PROPERTY(1, 2, conformanceVersion);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: {
- VkPhysicalDeviceDrmPropertiesEXT *props =
- (VkPhysicalDeviceDrmPropertiesEXT *)ext;
-
- props->hasPrimary = pdevice->has_master;
- props->primaryMajor = pdevice->master_major;
- props->primaryMinor = pdevice->master_minor;
-
- props->hasRender = pdevice->has_local;
- props->renderMajor = pdevice->local_major;
- props->renderMinor = pdevice->local_minor;
-
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT: {
- VkPhysicalDeviceExternalMemoryHostPropertiesEXT *props =
- (VkPhysicalDeviceExternalMemoryHostPropertiesEXT *) ext;
- /* Userptr needs page aligned memory. */
- props->minImportedHostPointerAlignment = 4096;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES: {
- VkPhysicalDeviceIDProperties *properties =
- (VkPhysicalDeviceIDProperties *)ext;
- CORE_PROPERTY(1, 1, deviceUUID);
- CORE_PROPERTY(1, 1, driverUUID);
- CORE_PROPERTY(1, 1, deviceLUID);
- CORE_PROPERTY(1, 1, deviceLUIDValid);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT: {
- VkPhysicalDeviceInlineUniformBlockPropertiesEXT *props =
- (VkPhysicalDeviceInlineUniformBlockPropertiesEXT *)ext;
- props->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE;
- props->maxPerStageDescriptorInlineUniformBlocks =
- MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
- props->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks =
- MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
- props->maxDescriptorSetInlineUniformBlocks =
- MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
- props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks =
- MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LINE_RASTERIZATION_PROPERTIES_EXT: {
- VkPhysicalDeviceLineRasterizationPropertiesEXT *props =
- (VkPhysicalDeviceLineRasterizationPropertiesEXT *)ext;
- /* In the Skylake PRM Vol. 7, subsection titled "GIQ (Diamond)
- * Sampling Rules - Legacy Mode", it says the following:
- *
- * "Note that the device divides a pixel into a 16x16 array of
- * subpixels, referenced by their upper left corners."
- *
- * This is the only known reference in the PRMs to the subpixel
- * precision of line rasterization and a "16x16 array of subpixels"
- * implies 4 subpixel precision bits. Empirical testing has shown
- * that 4 subpixel precision bits applies to all line rasterization
- * types.
- */
- props->lineSubPixelPrecisionBits = 4;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES: {
- VkPhysicalDeviceMaintenance3Properties *properties =
- (VkPhysicalDeviceMaintenance3Properties *)ext;
- /* This value doesn't matter for us today as our per-stage
- * descriptors are the real limit.
- */
- CORE_PROPERTY(1, 1, maxPerSetDescriptors);
- CORE_PROPERTY(1, 1, maxMemoryAllocationSize);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES: {
- VkPhysicalDeviceMultiviewProperties *properties =
- (VkPhysicalDeviceMultiviewProperties *)ext;
- CORE_PROPERTY(1, 1, maxMultiviewViewCount);
- CORE_PROPERTY(1, 1, maxMultiviewInstanceIndex);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PCI_BUS_INFO_PROPERTIES_EXT: {
- VkPhysicalDevicePCIBusInfoPropertiesEXT *properties =
- (VkPhysicalDevicePCIBusInfoPropertiesEXT *)ext;
- properties->pciDomain = pdevice->pci_info.domain;
- properties->pciBus = pdevice->pci_info.bus;
- properties->pciDevice = pdevice->pci_info.device;
- properties->pciFunction = pdevice->pci_info.function;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: {
- VkPhysicalDevicePerformanceQueryPropertiesKHR *properties =
- (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
- /* We could support this by spawning a shader to do the equation
- * normalization.
- */
- properties->allowCommandBufferQueryCopies = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: {
- VkPhysicalDevicePointClippingProperties *properties =
- (VkPhysicalDevicePointClippingProperties *) ext;
- CORE_PROPERTY(1, 1, pointClippingBehavior);
- break;
- }
-
+#if DETECT_OS_ANDROID
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch"
case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID: {
VkPhysicalDevicePresentationPropertiesANDROID *props =
(VkPhysicalDevicePresentationPropertiesANDROID *)ext;
- props->sharedImage = VK_FALSE;
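+         /* Advertise sharedImage only when gralloc reports a usage flag for
+          * front-buffer rendering.
+          */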
+ uint64_t front_rendering_usage = 0;
+ struct u_gralloc *gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
+ if (gralloc != NULL) {
+ u_gralloc_get_front_rendering_usage(gralloc, &front_rendering_usage);
+ u_gralloc_destroy(&gralloc);
+ }
+ props->sharedImage = front_rendering_usage ? VK_TRUE : VK_FALSE;
break;
}
#pragma GCC diagnostic pop
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: {
- VkPhysicalDeviceProtectedMemoryProperties *properties =
- (VkPhysicalDeviceProtectedMemoryProperties *)ext;
- CORE_PROPERTY(1, 1, protectedNoFault);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: {
- VkPhysicalDeviceProvokingVertexPropertiesEXT *properties =
- (VkPhysicalDeviceProvokingVertexPropertiesEXT *)ext;
- properties->provokingVertexModePerPipeline = true;
- properties->transformFeedbackPreservesTriangleFanProvokingVertex = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR: {
- VkPhysicalDevicePushDescriptorPropertiesKHR *properties =
- (VkPhysicalDevicePushDescriptorPropertiesKHR *) ext;
- properties->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_EXT: {
- VkPhysicalDeviceRobustness2PropertiesEXT *properties = (void *)ext;
- properties->robustStorageBufferAccessSizeAlignment =
- ANV_SSBO_BOUNDS_CHECK_ALIGNMENT;
- properties->robustUniformBufferAccessSizeAlignment =
- ANV_UBO_ALIGNMENT;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT: {
- VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *properties =
- (VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT *)ext;
- CORE_PROPERTY(1, 2, filterMinmaxImageComponentMapping);
- CORE_PROPERTY(1, 2, filterMinmaxSingleComponentFormats);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR: {
- VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *props =
- (VkPhysicalDeviceShaderIntegerDotProductPropertiesKHR *)ext;
-
- props->integerDotProduct8BitUnsignedAccelerated = false;
- props->integerDotProduct8BitSignedAccelerated = false;
- props->integerDotProduct8BitMixedSignednessAccelerated = false;
- props->integerDotProduct4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
- props->integerDotProduct4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
- props->integerDotProduct4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
- props->integerDotProduct16BitUnsignedAccelerated = false;
- props->integerDotProduct16BitSignedAccelerated = false;
- props->integerDotProduct16BitMixedSignednessAccelerated = false;
- props->integerDotProduct32BitUnsignedAccelerated = false;
- props->integerDotProduct32BitSignedAccelerated = false;
- props->integerDotProduct32BitMixedSignednessAccelerated = false;
- props->integerDotProduct64BitUnsignedAccelerated = false;
- props->integerDotProduct64BitSignedAccelerated = false;
- props->integerDotProduct64BitMixedSignednessAccelerated = false;
- props->integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating8BitSignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false;
- props->integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = pdevice->info.ver >= 12;
- props->integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = pdevice->info.ver >= 12;
- props->integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = pdevice->info.ver >= 12;
- props->integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating16BitSignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated = false;
- props->integerDotProductAccumulatingSaturating32BitUnsignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating32BitSignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated = false;
- props->integerDotProductAccumulatingSaturating64BitUnsignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating64BitSignedAccelerated = false;
- props->integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated = false;
-
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES: {
- VkPhysicalDeviceSubgroupProperties *properties = (void *)ext;
- CORE_PROPERTY(1, 1, subgroupSize);
- CORE_RENAMED_PROPERTY(1, 1, supportedStages,
- subgroupSupportedStages);
- CORE_RENAMED_PROPERTY(1, 1, supportedOperations,
- subgroupSupportedOperations);
- CORE_RENAMED_PROPERTY(1, 1, quadOperationsInAllStages,
- subgroupQuadOperationsInAllStages);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT: {
- VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *props =
- (VkPhysicalDeviceSubgroupSizeControlPropertiesEXT *)ext;
- STATIC_ASSERT(8 <= BRW_SUBGROUP_SIZE && BRW_SUBGROUP_SIZE <= 32);
- props->minSubgroupSize = 8;
- props->maxSubgroupSize = 32;
- props->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads;
- props->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT;
- break;
- }
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES_KHR : {
- VkPhysicalDeviceFloatControlsPropertiesKHR *properties = (void *)ext;
- CORE_PROPERTY(1, 2, denormBehaviorIndependence);
- CORE_PROPERTY(1, 2, roundingModeIndependence);
- CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat16);
- CORE_PROPERTY(1, 2, shaderDenormPreserveFloat16);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat16);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat16);
- CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat16);
- CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat32);
- CORE_PROPERTY(1, 2, shaderDenormPreserveFloat32);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat32);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat32);
- CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat32);
- CORE_PROPERTY(1, 2, shaderDenormFlushToZeroFloat64);
- CORE_PROPERTY(1, 2, shaderDenormPreserveFloat64);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTEFloat64);
- CORE_PROPERTY(1, 2, shaderRoundingModeRTZFloat64);
- CORE_PROPERTY(1, 2, shaderSignedZeroInfNanPreserveFloat64);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLE_LOCATIONS_PROPERTIES_EXT: {
- VkPhysicalDeviceSampleLocationsPropertiesEXT *props =
- (VkPhysicalDeviceSampleLocationsPropertiesEXT *)ext;
-
- props->sampleLocationSampleCounts =
- isl_device_get_sample_counts(&pdevice->isl_dev);
-
- /* See also anv_GetPhysicalDeviceMultisamplePropertiesEXT */
- props->maxSampleLocationGridSize.width = 1;
- props->maxSampleLocationGridSize.height = 1;
-
- props->sampleLocationCoordinateRange[0] = 0;
- props->sampleLocationCoordinateRange[1] = 0.9375;
- props->sampleLocationSubPixelBits = 4;
-
- props->variableSampleLocations = true;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TEXEL_BUFFER_ALIGNMENT_PROPERTIES_EXT: {
- VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *props =
- (VkPhysicalDeviceTexelBufferAlignmentPropertiesEXT *)ext;
-
- /* From the SKL PRM Vol. 2d, docs for RENDER_SURFACE_STATE::Surface
- * Base Address:
- *
- * "For SURFTYPE_BUFFER non-rendertarget surfaces, this field
- * specifies the base address of the first element of the surface,
- * computed in software by adding the surface base address to the
- * byte offset of the element in the buffer. The base address must
- * be aligned to element size."
- *
- * The typed dataport messages require that things be texel aligned.
- * Otherwise, we may just load/store the wrong data or, in the worst
- * case, there may be hangs.
- */
- props->storageTexelBufferOffsetAlignmentBytes = 16;
- props->storageTexelBufferOffsetSingleTexelAlignment = true;
-
- /* The sampler, however, is much more forgiving and it can handle
- * arbitrary byte alignment for linear and buffer surfaces. It's
- * hard to find a good PRM citation for this but years of empirical
- * experience demonstrate that this is true.
- */
- props->uniformTexelBufferOffsetAlignmentBytes = 1;
- props->uniformTexelBufferOffsetSingleTexelAlignment = false;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_PROPERTIES_KHR: {
- VkPhysicalDeviceTimelineSemaphorePropertiesKHR *properties =
- (VkPhysicalDeviceTimelineSemaphorePropertiesKHR *) ext;
- CORE_PROPERTY(1, 2, maxTimelineSemaphoreValueDifference);
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT: {
- VkPhysicalDeviceTransformFeedbackPropertiesEXT *props =
- (VkPhysicalDeviceTransformFeedbackPropertiesEXT *)ext;
-
- props->maxTransformFeedbackStreams = MAX_XFB_STREAMS;
- props->maxTransformFeedbackBuffers = MAX_XFB_BUFFERS;
- props->maxTransformFeedbackBufferSize = (1ull << 32);
- props->maxTransformFeedbackStreamDataSize = 128 * 4;
- props->maxTransformFeedbackBufferDataSize = 128 * 4;
- props->maxTransformFeedbackBufferDataStride = 2048;
- props->transformFeedbackQueries = true;
- props->transformFeedbackStreamsLinesTriangles = false;
- props->transformFeedbackRasterizationStreamSelect = false;
- /* This requires MI_MATH */
- props->transformFeedbackDraw = pdevice->info.verx10 >= 75;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: {
- VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props =
- (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext;
- /* We have to restrict this a bit for multiview */
- props->maxVertexAttribDivisor = UINT32_MAX / 16;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTI_DRAW_PROPERTIES_EXT: {
- VkPhysicalDeviceMultiDrawPropertiesEXT *props = (VkPhysicalDeviceMultiDrawPropertiesEXT *)ext;
- props->maxMultiDrawCount = 2048;
- break;
- }
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_PROPERTIES:
- anv_get_physical_device_properties_1_1(pdevice, (void *)ext);
- break;
-
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_PROPERTIES:
- anv_get_physical_device_properties_1_2(pdevice, (void *)ext);
- break;
+#endif
default:
- anv_debug_ignored_stype(ext->sType);
break;
}
}
-
-#undef CORE_RENAMED_PROPERTY
-#undef CORE_PROPERTY
}
static const VkQueueFamilyProperties
-anv_queue_family_properties_template = {
- .timestampValidBits = 36, /* XXX: Real value here */
- .minImageTransferGranularity = { 1, 1, 1 },
-};
+get_anv_queue_family_properties_template(const struct anv_physical_device *device)
+{
-void anv_GetPhysicalDeviceQueueFamilyProperties(
- VkPhysicalDevice physicalDevice,
- uint32_t* pCount,
- VkQueueFamilyProperties* pQueueFamilyProperties)
+ /*
+ * For Xe2+:
+    * Bspec 60411: Timestamp register can hold a 64-bit value.
+    *
+    * Platforms < Xe2:
+    * Bspec 46111: Timestamp register can hold only a 36-bit
+    * value.
+ */
+ const VkQueueFamilyProperties anv_queue_family_properties_template =
+ {
+ .timestampValidBits = device->info.ver >= 20 ? 64 : 36,
+ .minImageTransferGranularity = { 1, 1, 1 },
+ };
+
+ return anv_queue_family_properties_template;
+}
+
+static VkQueueFamilyProperties
+anv_device_physical_get_queue_properties(const struct anv_physical_device *device,
+ uint32_t family_index)
{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
- VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pCount);
+ const struct anv_queue_family *family = &device->queue.families[family_index];
+ VkQueueFamilyProperties properties =
+ get_anv_queue_family_properties_template(device);
- for (uint32_t i = 0; i < pdevice->queue.family_count; i++) {
- struct anv_queue_family *queue_family = &pdevice->queue.families[i];
- vk_outarray_append(&out, p) {
- *p = anv_queue_family_properties_template;
- p->queueFlags = queue_family->queueFlags;
- p->queueCount = queue_family->queueCount;
- }
- }
+ properties.queueFlags = family->queueFlags;
+ properties.queueCount = family->queueCount;
+ return properties;
}
void anv_GetPhysicalDeviceQueueFamilyProperties2(
@@ -2802,17 +2787,57 @@ void anv_GetPhysicalDeviceQueueFamilyProperties2(
VkQueueFamilyProperties2* pQueueFamilyProperties)
{
ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
- VK_OUTARRAY_MAKE(out, pQueueFamilyProperties, pQueueFamilyPropertyCount);
+ VK_OUTARRAY_MAKE_TYPED(VkQueueFamilyProperties2, out,
+ pQueueFamilyProperties, pQueueFamilyPropertyCount);
for (uint32_t i = 0; i < pdevice->queue.family_count; i++) {
struct anv_queue_family *queue_family = &pdevice->queue.families[i];
- vk_outarray_append(&out, p) {
- p->queueFamilyProperties = anv_queue_family_properties_template;
- p->queueFamilyProperties.queueFlags = queue_family->queueFlags;
- p->queueFamilyProperties.queueCount = queue_family->queueCount;
-
- vk_foreach_struct(s, p->pNext) {
- anv_debug_ignored_stype(s->sType);
+ vk_outarray_append_typed(VkQueueFamilyProperties2, &out, p) {
+ p->queueFamilyProperties =
+ anv_device_physical_get_queue_properties(pdevice, i);
+
+ vk_foreach_struct(ext, p->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_QUEUE_FAMILY_GLOBAL_PRIORITY_PROPERTIES_KHR: {
+ VkQueueFamilyGlobalPriorityPropertiesKHR *properties =
+ (VkQueueFamilyGlobalPriorityPropertiesKHR *)ext;
+
+ /* Deliberately sorted low to high */
+ VkQueueGlobalPriorityKHR all_priorities[] = {
+ VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR,
+ };
+
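+               /* Advertise every priority up to the maximum context priority
+                * the kernel grants us; the sorted order lets the loop stop at
+                * the first unsupported level.
+                */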
+ uint32_t count = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(all_priorities); i++) {
+ if (all_priorities[i] > pdevice->max_context_priority)
+ break;
+
+ properties->priorities[count++] = all_priorities[i];
+ }
+ properties->priorityCount = count;
+ break;
+ }
+ case VK_STRUCTURE_TYPE_QUEUE_FAMILY_QUERY_RESULT_STATUS_PROPERTIES_KHR: {
+ VkQueueFamilyQueryResultStatusPropertiesKHR *prop =
+ (VkQueueFamilyQueryResultStatusPropertiesKHR *)ext;
+ prop->queryResultStatusSupport = VK_TRUE;
+ break;
+ }
+ case VK_STRUCTURE_TYPE_QUEUE_FAMILY_VIDEO_PROPERTIES_KHR: {
+ VkQueueFamilyVideoPropertiesKHR *prop =
+ (VkQueueFamilyVideoPropertiesKHR *)ext;
+ if (queue_family->queueFlags & VK_QUEUE_VIDEO_DECODE_BIT_KHR) {
+ prop->videoCodecOperations = VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR |
+ VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR;
+ }
+ break;
+ }
+ default:
+ anv_debug_ignored_stype(ext->sType);
+ }
}
}
}
@@ -2847,6 +2872,9 @@ anv_get_memory_budget(VkPhysicalDevice physicalDevice,
{
ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
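+   /* Skip the budget calculation entirely when EXT_memory_budget is not
+    * supported.
+    */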
+ if (!device->vk.supported_extensions.EXT_memory_budget)
+ return;
+
anv_update_meminfo(device, device->local_fd);
VkDeviceSize total_sys_heaps_size = 0, total_vram_heaps_size = 0;
@@ -2866,10 +2894,14 @@ anv_get_memory_budget(VkPhysicalDevice physicalDevice,
if (device->memory.heaps[i].is_local_mem) {
total_heaps_size = total_vram_heaps_size;
- mem_available = device->vram.available;
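+         /* When a non-mappable VRAM region exists, heap 0 reports its
+          * availability; otherwise report the mappable region.
+          */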
+ if (device->vram_non_mappable.size > 0 && i == 0) {
+ mem_available = device->vram_non_mappable.available;
+ } else {
+ mem_available = device->vram_mappable.available;
+ }
} else {
total_heaps_size = total_sys_heaps_size;
- mem_available = device->sys.available;
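+         /* Never report more available memory than the system heaps
+          * advertise in total.
+          */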
+ mem_available = MIN2(device->sys.available, total_heaps_size);
}
double heap_proportion = (double) heap_size / total_heaps_size;
@@ -2926,21 +2958,6 @@ void anv_GetPhysicalDeviceMemoryProperties2(
}
}
-void
-anv_GetDeviceGroupPeerMemoryFeatures(
- VkDevice device,
- uint32_t heapIndex,
- uint32_t localDeviceIndex,
- uint32_t remoteDeviceIndex,
- VkPeerMemoryFeatureFlags* pPeerMemoryFeatures)
-{
- assert(localDeviceIndex == 0 && remoteDeviceIndex == 0);
- *pPeerMemoryFeatures = VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT |
- VK_PEER_MEMORY_FEATURE_COPY_DST_BIT |
- VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT |
- VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT;
-}
-
PFN_vkVoidFunction anv_GetInstanceProcAddr(
VkInstance _instance,
const char* pName)
@@ -2957,71 +2974,29 @@ PFN_vkVoidFunction anv_GetInstanceProcAddr(
PUBLIC
VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
VkInstance instance,
- const char* pName);
-
-PUBLIC
-VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
- VkInstance instance,
const char* pName)
{
return anv_GetInstanceProcAddr(instance, pName);
}
-/* With version 4+ of the loader interface the ICD should expose
- * vk_icdGetPhysicalDeviceProcAddr()
- */
-PUBLIC
-VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetPhysicalDeviceProcAddr(
- VkInstance _instance,
- const char* pName);
-
-PFN_vkVoidFunction vk_icdGetPhysicalDeviceProcAddr(
- VkInstance _instance,
- const char* pName)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- return vk_instance_get_physical_device_proc_addr(&instance->vk, pName);
-}
-
-static struct anv_state
-anv_state_pool_emit_data(struct anv_state_pool *pool, size_t size, size_t align, const void *p)
-{
- struct anv_state state;
-
- state = anv_state_pool_alloc(pool, size, align);
- memcpy(state.map, p, size);
-
- return state;
-}
-
static void
anv_device_init_border_colors(struct anv_device *device)
{
- if (device->info.is_haswell) {
- static const struct hsw_border_color border_colors[] = {
- [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
- [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
- [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
- [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } },
- [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } },
- [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } },
- };
-
- device->border_colors =
- anv_state_pool_emit_data(&device->dynamic_state_pool,
- sizeof(border_colors), 512, border_colors);
- } else {
- static const struct gfx8_border_color border_colors[] = {
- [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
- [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
- [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
- [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } },
- [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } },
- [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } },
- };
+ static const struct gfx8_border_color border_colors[] = {
+ [VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 0.0 } },
+ [VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK] = { .float32 = { 0.0, 0.0, 0.0, 1.0 } },
+ [VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE] = { .float32 = { 1.0, 1.0, 1.0, 1.0 } },
+ [VK_BORDER_COLOR_INT_TRANSPARENT_BLACK] = { .uint32 = { 0, 0, 0, 0 } },
+ [VK_BORDER_COLOR_INT_OPAQUE_BLACK] = { .uint32 = { 0, 0, 0, 1 } },
+ [VK_BORDER_COLOR_INT_OPAQUE_WHITE] = { .uint32 = { 1, 1, 1, 1 } },
+ };
- device->border_colors =
- anv_state_pool_emit_data(&device->dynamic_state_pool,
+ device->border_colors =
+ anv_state_pool_emit_data(&device->dynamic_state_pool,
+ sizeof(border_colors), 64, border_colors);
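+   /* With EXT_descriptor_buffer enabled, emit a second copy of the border
+    * colors into the descriptor-buffer dynamic state pool.
+    */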
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ device->border_colors_db =
+ anv_state_pool_emit_data(&device->dynamic_state_db_pool,
sizeof(border_colors), 64, border_colors);
}
}
@@ -3030,7 +3005,9 @@ static VkResult
anv_device_init_trivial_batch(struct anv_device *device)
{
VkResult result = anv_device_alloc_bo(device, "trivial-batch", 4096,
- ANV_BO_ALLOC_MAPPED,
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_COHERENT |
+ ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->trivial_batch_bo);
if (result != VK_SUCCESS)
@@ -3045,29 +3022,9 @@ anv_device_init_trivial_batch(struct anv_device *device)
anv_batch_emit(&batch, GFX7_MI_BATCH_BUFFER_END, bbe);
anv_batch_emit(&batch, GFX7_MI_NOOP, noop);
- if (!device->info.has_llc)
- intel_clflush_range(batch.start, batch.next - batch.start);
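+   /* No explicit flush is needed here: the BO is allocated host-coherent. */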
-
return VK_SUCCESS;
}
-static int
-vk_priority_to_gen(int priority)
-{
- switch (priority) {
- case VK_QUEUE_GLOBAL_PRIORITY_LOW_EXT:
- return INTEL_CONTEXT_LOW_PRIORITY;
- case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT:
- return INTEL_CONTEXT_MEDIUM_PRIORITY;
- case VK_QUEUE_GLOBAL_PRIORITY_HIGH_EXT:
- return INTEL_CONTEXT_HIGH_PRIORITY;
- case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_EXT:
- return INTEL_CONTEXT_REALTIME_PRIORITY;
- default:
- unreachable("Invalid priority");
- }
-}
-
static bool
get_bo_from_pool(struct intel_batch_decode_bo *ret,
struct anv_block_pool *pool,
@@ -3098,29 +3055,62 @@ decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
if (get_bo_from_pool(&ret_bo, &device->dynamic_state_pool.block_pool, address))
return ret_bo;
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ get_bo_from_pool(&ret_bo, &device->dynamic_state_db_pool.block_pool, address))
+ return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->instruction_state_pool.block_pool, address))
return ret_bo;
if (get_bo_from_pool(&ret_bo, &device->binding_table_pool.block_pool, address))
return ret_bo;
- if (get_bo_from_pool(&ret_bo, &device->surface_state_pool.block_pool, address))
+ if (get_bo_from_pool(&ret_bo, &device->scratch_surface_state_pool.block_pool, address))
+ return ret_bo;
+ if (device->physical->indirect_descriptors &&
+ get_bo_from_pool(&ret_bo, &device->bindless_surface_state_pool.block_pool, address))
+ return ret_bo;
+ if (get_bo_from_pool(&ret_bo, &device->internal_surface_state_pool.block_pool, address))
+ return ret_bo;
+ if (device->physical->indirect_descriptors &&
+ get_bo_from_pool(&ret_bo, &device->indirect_push_descriptor_pool.block_pool, address))
+ return ret_bo;
+ if (device->info->has_aux_map &&
+ get_bo_from_pool(&ret_bo, &device->aux_tt_pool.block_pool, address))
return ret_bo;
if (!device->cmd_buffer_being_decoded)
return (struct intel_batch_decode_bo) { };
- struct anv_batch_bo **bo;
-
- u_vector_foreach(bo, &device->cmd_buffer_being_decoded->seen_bbos) {
+ struct anv_batch_bo **bbo;
+ u_vector_foreach(bbo, &device->cmd_buffer_being_decoded->seen_bbos) {
/* The decoder zeroes out the top 16 bits, so we need to as well */
- uint64_t bo_address = (*bo)->bo->offset & (~0ull >> 16);
+ uint64_t bo_address = (*bbo)->bo->offset & (~0ull >> 16);
- if (address >= bo_address && address < bo_address + (*bo)->bo->size) {
+ if (address >= bo_address && address < bo_address + (*bbo)->bo->size) {
return (struct intel_batch_decode_bo) {
.addr = bo_address,
- .size = (*bo)->bo->size,
- .map = (*bo)->bo->map,
+ .size = (*bbo)->bo->size,
+ .map = (*bbo)->bo->map,
};
}
+
+ uint32_t dep_words = (*bbo)->relocs.dep_words;
+ BITSET_WORD *deps = (*bbo)->relocs.deps;
+ for (uint32_t w = 0; w < dep_words; w++) {
+ BITSET_WORD mask = deps[w];
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ uint32_t gem_handle = w * BITSET_WORDBITS + i;
+ struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
+ assert(bo->refcount > 0);
+ bo_address = bo->offset & (~0ull >> 16);
+ if (address >= bo_address && address < bo_address + bo->size) {
+ return (struct intel_batch_decode_bo) {
+ .addr = bo_address,
+ .size = bo->size,
+ .map = bo->map,
+ };
+ }
+ }
+ }
}
return (struct intel_batch_decode_bo) { };
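decode_get_bo() compares the decoder's addresses against BO offsets with the upper 16 bits masked off, since the GPU works with 48-bit virtual addresses. A small self-contained sketch of that containment test (the function name is illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Sketch of the range check used above: both the queried address and the
 * BO offset are reduced to their low 48 bits before comparing.
 */
static bool
example_bo_contains_address(uint64_t bo_offset, uint64_t bo_size,
                            uint64_t address)
{
   const uint64_t mask_48b = ~0ull >> 16;
   const uint64_t bo_address = bo_offset & mask_48b;
   address &= mask_48b;
   return address >= bo_address && address < bo_address + bo_size;
}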
@@ -3139,10 +3129,8 @@ intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size)
return NULL;
struct anv_device *device = (struct anv_device*)driver_ctx;
- assert(device->physical->supports_48bit_addresses &&
- device->physical->use_softpin);
- struct anv_state_pool *pool = &device->dynamic_state_pool;
+ struct anv_state_pool *pool = &device->aux_tt_pool;
buf->state = anv_state_pool_alloc(pool, size, size);
buf->base.gpu = pool->block_pool.bo->offset + buf->state.offset;
@@ -3157,7 +3145,7 @@ intel_aux_map_buffer_free(void *driver_ctx, struct intel_buffer *buffer)
{
struct intel_aux_map_buffer *buf = (struct intel_aux_map_buffer*)buffer;
struct anv_device *device = (struct anv_device*)driver_ctx;
- struct anv_state_pool *pool = &device->dynamic_state_pool;
+ struct anv_state_pool *pool = &device->aux_tt_pool;
anv_state_pool_free(pool, buf->state);
free(buf);
}
@@ -3168,22 +3156,93 @@ static struct intel_mapped_pinned_buffer_alloc aux_map_allocator = {
};
static VkResult
-check_physical_device_features(VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceFeatures *features)
+anv_device_setup_context_or_vm(struct anv_device *device,
+ const VkDeviceCreateInfo *pCreateInfo,
+ const uint32_t num_queues)
{
- VkPhysicalDeviceFeatures supported_features;
- anv_GetPhysicalDeviceFeatures(physicalDevice, &supported_features);
- VkBool32 *supported_feature = (VkBool32 *)&supported_features;
- VkBool32 *enabled_feature = (VkBool32 *)features;
- unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
- for (uint32_t i = 0; i < num_features; i++) {
- if (enabled_feature[i] && !supported_feature[i])
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_device_setup_context(device, pCreateInfo, num_queues);
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_device_setup_vm(device);
+ default:
+ unreachable("Missing");
+ return VK_ERROR_UNKNOWN;
}
+}
+
+static bool
+anv_device_destroy_context_or_vm(struct anv_device *device)
+{
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ if (device->physical->has_vm_control)
+ return anv_i915_device_destroy_vm(device);
+ else
+ return intel_gem_destroy_context(device->fd, device->context_id);
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_device_destroy_vm(device);
+ default:
+ unreachable("Missing");
+ return false;
+ }
+}
+
+static VkResult
+anv_device_init_trtt(struct anv_device *device)
+{
+ struct anv_trtt *trtt = &device->trtt;
+
+ if (pthread_mutex_init(&trtt->mutex, NULL) != 0)
+ return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+
+ list_inithead(&trtt->in_flight_batches);
return VK_SUCCESS;
}
+static void
+anv_device_finish_trtt(struct anv_device *device)
+{
+ struct anv_trtt *trtt = &device->trtt;
+
+ if (trtt->timeline_val > 0) {
+ struct drm_syncobj_timeline_wait wait = {
+ .handles = (uintptr_t)&trtt->timeline_handle,
+ .points = (uintptr_t)&trtt->timeline_val,
+ .timeout_nsec = INT64_MAX,
+ .count_handles = 1,
+ .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
+ .first_signaled = false,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wait))
+ fprintf(stderr, "TR-TT syncobj wait failed!\n");
+
+ list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
+ &trtt->in_flight_batches, link)
+ anv_trtt_batch_bo_free(device, trtt_bbo);
+
+ }
+
+ if (trtt->timeline_handle > 0) {
+ struct drm_syncobj_destroy destroy = {
+ .handle = trtt->timeline_handle,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &destroy))
+ fprintf(stderr, "TR-TT syncobj destroy failed!\n");
+ }
+
+ pthread_mutex_destroy(&trtt->mutex);
+
+ vk_free(&device->vk.alloc, trtt->l3_mirror);
+ vk_free(&device->vk.alloc, trtt->l2_mirror);
+
+ for (int i = 0; i < trtt->num_page_table_bos; i++)
+ anv_device_release_bo(device, trtt->page_table_bos[i]);
+
+ vk_free(&device->vk.alloc, trtt->page_table_bos);
+}
+
VkResult anv_CreateDevice(
VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo* pCreateInfo,
@@ -3196,148 +3255,125 @@ VkResult anv_CreateDevice(
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO);
- /* Check enabled features */
- bool robust_buffer_access = false;
- if (pCreateInfo->pEnabledFeatures) {
- result = check_physical_device_features(physicalDevice,
- pCreateInfo->pEnabledFeatures);
- if (result != VK_SUCCESS)
- return result;
-
- if (pCreateInfo->pEnabledFeatures->robustBufferAccess)
- robust_buffer_access = true;
- }
-
- vk_foreach_struct_const(ext, pCreateInfo->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2: {
- const VkPhysicalDeviceFeatures2 *features = (const void *)ext;
- result = check_physical_device_features(physicalDevice,
- &features->features);
- if (result != VK_SUCCESS)
- return result;
-
- if (features->features.robustBufferAccess)
- robust_buffer_access = true;
- break;
- }
-
- default:
- /* Don't warn */
- break;
- }
- }
-
/* Check requested queues and fail if we are requested to create any
* queues with flags we don't support.
*/
assert(pCreateInfo->queueCreateInfoCount > 0);
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
- if (pCreateInfo->pQueueCreateInfos[i].flags != 0)
- return vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ if (pCreateInfo->pQueueCreateInfos[i].flags & ~VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
+ return vk_error(physical_device, VK_ERROR_INITIALIZATION_FAILED);
}
- /* Check if client specified queue priority. */
- const VkDeviceQueueGlobalPriorityCreateInfoEXT *queue_priority =
- vk_find_struct_const(pCreateInfo->pQueueCreateInfos[0].pNext,
- DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_EXT);
-
- VkQueueGlobalPriorityEXT priority =
- queue_priority ? queue_priority->globalPriority :
- VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT;
-
- device = vk_alloc2(&physical_device->instance->vk.alloc, pAllocator,
+ device = vk_zalloc2(&physical_device->instance->vk.alloc, pAllocator,
sizeof(*device), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!device)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(physical_device, VK_ERROR_OUT_OF_HOST_MEMORY);
struct vk_device_dispatch_table dispatch_table;
+
+ bool override_initial_entrypoints = true;
+ if (physical_device->instance->vk.app_info.app_name &&
+ !strcmp(physical_device->instance->vk.app_info.app_name, "HITMAN3.exe")) {
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &anv_hitman3_device_entrypoints,
+ true);
+ override_initial_entrypoints = false;
+ }
+ if (physical_device->info.ver < 12 &&
+ physical_device->instance->vk.app_info.app_name &&
+ !strcmp(physical_device->instance->vk.app_info.app_name, "DOOM 64")) {
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &anv_doom64_device_entrypoints,
+ true);
+ override_initial_entrypoints = false;
+ }
+#if DETECT_OS_ANDROID
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &anv_android_device_entrypoints,
+ true);
+ override_initial_entrypoints = false;
+#endif
+ if (physical_device->instance->vk.trace_mode & VK_TRACE_MODE_RMV) {
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &anv_rmv_device_entrypoints,
+ true);
+ override_initial_entrypoints = false;
+ }
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
- anv_genX(&physical_device->info, device_entrypoints), true);
+ anv_genX(&physical_device->info, device_entrypoints),
+ override_initial_entrypoints);
vk_device_dispatch_table_from_entrypoints(&dispatch_table,
&anv_device_entrypoints, false);
+ vk_device_dispatch_table_from_entrypoints(&dispatch_table,
+ &wsi_device_entrypoints, false);
+
result = vk_device_init(&device->vk, &physical_device->vk,
&dispatch_table, pCreateInfo, pAllocator);
- if (result != VK_SUCCESS) {
- vk_error(result);
+ if (result != VK_SUCCESS)
goto fail_alloc;
- }
- if (INTEL_DEBUG & DEBUG_BATCH) {
- const unsigned decode_flags =
- INTEL_BATCH_DECODE_FULL |
- ((INTEL_DEBUG & DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
- INTEL_BATCH_DECODE_OFFSETS |
- INTEL_BATCH_DECODE_FLOATS;
+ if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS)) {
+ for (unsigned i = 0; i < physical_device->queue.family_count; i++) {
+ struct intel_batch_decode_ctx *decoder = &device->decoder[i];
+
+ const unsigned decode_flags = INTEL_BATCH_DECODE_DEFAULT_FLAGS;
+
+ intel_batch_decode_ctx_init_brw(decoder,
+ &physical_device->compiler->isa,
+ &physical_device->info,
+ stderr, decode_flags, NULL,
+ decode_get_bo, NULL, device);
+ intel_batch_stats_reset(decoder);
- intel_batch_decode_ctx_init(&device->decoder_ctx,
- &physical_device->info,
- stderr, decode_flags, NULL,
- decode_get_bo, NULL, device);
+ decoder->engine = physical_device->queue.families[i].engine_class;
+ decoder->dynamic_base = physical_device->va.dynamic_state_pool.addr;
+ decoder->surface_base = physical_device->va.internal_surface_state_pool.addr;
+ decoder->instruction_base = physical_device->va.instruction_state_pool.addr;
+ }
}
- device->physical = physical_device;
- device->_lost = false;
+ anv_device_set_physical(device, physical_device);
+ device->kmd_backend = anv_kmd_backend_get(device->info->kmd_type);
/* XXX(chadv): Can we dup() physicalDevice->fd here? */
device->fd = open(physical_device->path, O_RDWR | O_CLOEXEC);
if (device->fd == -1) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_device;
}
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ device->vk.check_status = anv_i915_device_check_status;
+ break;
+ case INTEL_KMD_TYPE_XE:
+ device->vk.check_status = anv_xe_device_check_status;
+ break;
+ default:
+ unreachable("Missing");
+ }
+
+ device->vk.command_buffer_ops = &anv_cmd_buffer_ops;
+   if (physical_device->info.kmd_type == INTEL_KMD_TYPE_I915)
+      device->vk.create_sync_for_memory = anv_create_sync_for_memory;

+ vk_device_set_drm_fd(&device->vk, device->fd);
+
uint32_t num_queues = 0;
for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++)
num_queues += pCreateInfo->pQueueCreateInfos[i].queueCount;
- if (device->physical->engine_info) {
- /* The kernel API supports at most 64 engines */
- assert(num_queues <= 64);
- uint16_t engine_classes[64];
- int engine_count = 0;
- for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
- const VkDeviceQueueCreateInfo *queueCreateInfo =
- &pCreateInfo->pQueueCreateInfos[i];
-
- assert(queueCreateInfo->queueFamilyIndex <
- physical_device->queue.family_count);
- struct anv_queue_family *queue_family =
- &physical_device->queue.families[queueCreateInfo->queueFamilyIndex];
-
- for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++)
- engine_classes[engine_count++] = queue_family->engine_class;
- }
- device->context_id =
- anv_gem_create_context_engines(device,
- physical_device->engine_info,
- engine_count, engine_classes);
- } else {
- assert(num_queues == 1);
- device->context_id = anv_gem_create_context(device);
- }
- if (device->context_id == -1) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = anv_device_setup_context_or_vm(device, pCreateInfo, num_queues);
+ if (result != VK_SUCCESS)
goto fail_fd;
- }
-
- /* Here we tell the kernel not to attempt to recover our context but
- * immediately (on the next batchbuffer submission) report that the
- * context is lost, and we will do the recovery ourselves. In the case
- * of Vulkan, recovery means throwing VK_ERROR_DEVICE_LOST and letting
- * the client clean up the pieces.
- */
- anv_gem_set_context_param(device->fd, device->context_id,
- I915_CONTEXT_PARAM_RECOVERABLE, false);
-
- device->has_thread_submit = physical_device->has_thread_submit;
device->queues =
vk_zalloc(&device->vk.alloc, num_queues * sizeof(*device->queues), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (device->queues == NULL) {
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail_context_id;
}
@@ -3347,15 +3383,8 @@ VkResult anv_CreateDevice(
&pCreateInfo->pQueueCreateInfos[i];
for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++) {
- /* When using legacy contexts, we use I915_EXEC_RENDER but, with
- * engine-based contexts, the bottom 6 bits of exec_flags are used
- * for the engine ID.
- */
- uint32_t exec_flags = device->physical->engine_info ?
- device->queue_count : I915_EXEC_RENDER;
-
result = anv_queue_init(device, &device->queues[device->queue_count],
- exec_flags, queueCreateInfo);
+ queueCreateInfo, j);
if (result != VK_SUCCESS)
goto fail_queues;
@@ -3363,149 +3392,281 @@ VkResult anv_CreateDevice(
}
}
- if (physical_device->use_softpin) {
- if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_queues;
- }
+ if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) {
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+ goto fail_queues;
+ }
- /* keep the page with address zero out of the allocator */
- util_vma_heap_init(&device->vma_lo,
- LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE);
+ /* keep the page with address zero out of the allocator */
+ util_vma_heap_init(&device->vma_lo,
+ device->physical->va.low_heap.addr,
+ device->physical->va.low_heap.size);
- util_vma_heap_init(&device->vma_cva, CLIENT_VISIBLE_HEAP_MIN_ADDRESS,
- CLIENT_VISIBLE_HEAP_SIZE);
+ util_vma_heap_init(&device->vma_hi,
+ device->physical->va.high_heap.addr,
+ device->physical->va.high_heap.size);
- /* Leave the last 4GiB out of the high vma range, so that no state
- * base address + size can overflow 48 bits. For more information see
- * the comment about Wa32bitGeneralStateOffset in anv_allocator.c
- */
- util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS,
- physical_device->gtt_size - (1ull << 32) -
- HIGH_HEAP_MIN_ADDRESS);
+ if (device->physical->indirect_descriptors) {
+ util_vma_heap_init(&device->vma_desc,
+ device->physical->va.indirect_descriptor_pool.addr,
+ device->physical->va.indirect_descriptor_pool.size);
+ } else {
+ util_vma_heap_init(&device->vma_desc,
+ device->physical->va.bindless_surface_state_pool.addr,
+ device->physical->va.bindless_surface_state_pool.size);
}
- list_inithead(&device->memory_objects);
-
- /* As per spec, the driver implementation may deny requests to acquire
- * a priority above the default priority (MEDIUM) if the caller does not
- * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_EXT
- * is returned.
+   /* Always initialized because the memory types point to this and they
+ * are on the physical device.
*/
- if (physical_device->has_context_priority) {
- int err = anv_gem_set_context_param(device->fd, device->context_id,
- I915_CONTEXT_PARAM_PRIORITY,
- vk_priority_to_gen(priority));
- if (err != 0 && priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_EXT) {
- result = vk_error(VK_ERROR_NOT_PERMITTED_EXT);
- goto fail_vmas;
- }
- }
-
- device->info = physical_device->info;
- device->isl_dev = physical_device->isl_dev;
+ util_vma_heap_init(&device->vma_desc_buf,
+ device->physical->va.descriptor_buffer_pool.addr,
+ device->physical->va.descriptor_buffer_pool.size);
- /* On Broadwell and later, we can use batch chaining to more efficiently
- * implement growing command buffers. Prior to Haswell, the kernel
- * command parser gets in the way and we have to fall back to growing
- * the batch.
- */
- device->can_chain_batches = device->info.ver >= 8;
+ util_vma_heap_init(&device->vma_samplers,
+ device->physical->va.sampler_state_pool.addr,
+ device->physical->va.sampler_state_pool.size);
+ util_vma_heap_init(&device->vma_trtt,
+ device->physical->va.trtt.addr,
+ device->physical->va.trtt.size);
- device->robust_buffer_access = robust_buffer_access;
+ list_inithead(&device->memory_objects);
+ list_inithead(&device->image_private_objects);
if (pthread_mutex_init(&device->mutex, NULL) != 0) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_queues;
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+ goto fail_vmas;
}
pthread_condattr_t condattr;
if (pthread_condattr_init(&condattr) != 0) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
if (pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC) != 0) {
pthread_condattr_destroy(&condattr);
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
if (pthread_cond_init(&device->queue_submit, &condattr) != 0) {
pthread_condattr_destroy(&condattr);
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
goto fail_mutex;
}
pthread_condattr_destroy(&condattr);
- result = anv_bo_cache_init(&device->bo_cache);
+ if (physical_device->instance->vk.trace_mode & VK_TRACE_MODE_RMV)
+ anv_memory_trace_init(device);
+
+ result = anv_bo_cache_init(&device->bo_cache, device);
if (result != VK_SUCCESS)
goto fail_queue_cond;
- anv_bo_pool_init(&device->batch_bo_pool, device, "batch");
+ anv_bo_pool_init(&device->batch_bo_pool, device, "batch",
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ ANV_BO_ALLOC_CAPTURE);
+ if (device->vk.enabled_extensions.KHR_acceleration_structure) {
+ anv_bo_pool_init(&device->bvh_bo_pool, device, "bvh build",
+ 0 /* alloc_flags */);
+ }
/* Because scratch is also relative to General State Base Address, we leave
* the base address 0 and start the pool memory at an offset. This way we
* get the correct offsets in the anv_states that get allocated from it.
*/
result = anv_state_pool_init(&device->general_state_pool, device,
- "general pool",
- 0, GENERAL_STATE_POOL_MIN_ADDRESS, 16384);
+ &(struct anv_state_pool_params) {
+ .name = "general pool",
+ .base_address = 0,
+ .start_offset = device->physical->va.general_state_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.general_state_pool.size
+ });
if (result != VK_SUCCESS)
goto fail_batch_bo_pool;
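Because the pool passes base_address = 0 and puts its VA in start_offset, the offsets stored in anv_state values are already valid relative to a zero General State Base Address. A hedged sketch of that arithmetic with made-up numbers:

#include <stdint.h>

/* Illustrative arithmetic only: with a hypothetical pool VA of
 * 0x100000000000 and an allocation 4 KiB into the pool, state.offset comes
 * back as 0x100000001000, which is already the final GPU address because
 * General State Base Address is programmed to 0.
 */
static uint64_t
example_general_state_address(uint64_t pool_va, uint64_t offset_in_pool)
{
   const uint64_t general_state_base_address = 0;
   return general_state_base_address + pool_va + offset_in_pool;
}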
result = anv_state_pool_init(&device->dynamic_state_pool, device,
- "dynamic pool",
- DYNAMIC_STATE_POOL_MIN_ADDRESS, 0, 16384);
+ &(struct anv_state_pool_params) {
+ .name = "dynamic pool",
+ .base_address = device->physical->va.dynamic_state_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.dynamic_state_pool.size,
+ });
if (result != VK_SUCCESS)
goto fail_general_state_pool;
- if (device->info.ver >= 8) {
- /* The border color pointer is limited to 24 bits, so we need to make
- * sure that any such color used at any point in the program doesn't
- * exceed that limit.
- * We achieve that by reserving all the custom border colors we support
- * right off the bat, so they are close to the base address.
- */
- anv_state_reserved_pool_init(&device->custom_border_colors,
- &device->dynamic_state_pool,
- MAX_CUSTOM_BORDER_COLORS,
- sizeof(struct gfx8_border_color), 64);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ result = anv_state_pool_init(&device->dynamic_state_db_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "dynamic pool (db)",
+ .base_address = device->physical->va.dynamic_state_db_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.dynamic_state_db_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_dynamic_state_pool;
+ }
+
+ /* The border color pointer is limited to 24 bits, so we need to make
+ * sure that any such color used at any point in the program doesn't
+ * exceed that limit.
+ * We achieve that by reserving all the custom border colors we support
+ * right off the bat, so they are close to the base address.
+ */
+ anv_state_reserved_pool_init(&device->custom_border_colors,
+ &device->dynamic_state_pool,
+ MAX_CUSTOM_BORDER_COLORS,
+ sizeof(struct gfx8_border_color), 64);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ result = anv_state_reserved_array_pool_init(&device->custom_border_colors_db,
+ &device->dynamic_state_db_pool,
+ MAX_CUSTOM_BORDER_COLORS,
+ sizeof(struct gfx8_border_color), 64);
+ if (result != VK_SUCCESS)
+ goto fail_dynamic_state_db_pool;
}
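The reservation works because the entire custom border color block stays far below what a 24-bit pointer can address. A hedged back-of-the-envelope check, assuming a count on the order of 4096 entries (the real limit is MAX_CUSTOM_BORDER_COLORS in anv_private.h):

/* Worked arithmetic with a hypothetical count: 4096 entries * 64 bytes each
 * = 256 KiB, comfortably below the 16 MiB (1 << 24) reachable through a
 * 24-bit border color pointer.
 */
_Static_assert(4096 * 64 < (1u << 24),
               "reserved border colors must stay within 24-bit addressing");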
result = anv_state_pool_init(&device->instruction_state_pool, device,
- "instruction pool",
- INSTRUCTION_STATE_POOL_MIN_ADDRESS, 0, 16384);
+ &(struct anv_state_pool_params) {
+ .name = "instruction pool",
+ .base_address = device->physical->va.instruction_state_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.instruction_state_pool.size,
+ });
if (result != VK_SUCCESS)
- goto fail_dynamic_state_pool;
+ goto fail_reserved_array_pool;
- result = anv_state_pool_init(&device->surface_state_pool, device,
- "surface state pool",
- SURFACE_STATE_POOL_MIN_ADDRESS, 0, 4096);
+ if (device->info->verx10 >= 125) {
+ /* Put the scratch surface states at the beginning of the internal
+ * surface state pool.
+ */
+ result = anv_state_pool_init(&device->scratch_surface_state_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "scratch surface state pool",
+ .base_address = device->physical->va.scratch_surface_state_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.scratch_surface_state_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_instruction_state_pool;
+
+ result = anv_state_pool_init(&device->internal_surface_state_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "internal surface state pool",
+ .base_address = device->physical->va.internal_surface_state_pool.addr,
+ .start_offset = device->physical->va.scratch_surface_state_pool.size,
+ .block_size = 4096,
+ .max_size = device->physical->va.internal_surface_state_pool.size,
+ });
+ } else {
+ result = anv_state_pool_init(&device->internal_surface_state_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "internal surface state pool",
+ .base_address = device->physical->va.internal_surface_state_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.internal_surface_state_pool.size,
+ });
+ }
if (result != VK_SUCCESS)
- goto fail_instruction_state_pool;
+ goto fail_scratch_surface_state_pool;
+
+ if (device->physical->indirect_descriptors) {
+ result = anv_state_pool_init(&device->bindless_surface_state_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "bindless surface state pool",
+ .base_address = device->physical->va.bindless_surface_state_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.bindless_surface_state_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_internal_surface_state_pool;
+ }
- if (physical_device->use_softpin) {
- int64_t bt_pool_offset = (int64_t)BINDING_TABLE_POOL_MIN_ADDRESS -
- (int64_t)SURFACE_STATE_POOL_MIN_ADDRESS;
+ if (device->info->verx10 >= 125) {
+ /* We're using 3DSTATE_BINDING_TABLE_POOL_ALLOC to give the binding
+ * table its own base address separately from surface state base.
+ */
+ result = anv_state_pool_init(&device->binding_table_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "binding table pool",
+ .base_address = device->physical->va.binding_table_pool.addr,
+ .block_size = BINDING_TABLE_POOL_BLOCK_SIZE,
+ .max_size = device->physical->va.binding_table_pool.size,
+ });
+ } else {
+ /* The binding table should be in front of the surface states in virtual
+       * address space so that all surface states can be expressed as relative
+ * offsets from the binding table location.
+ */
+ assert(device->physical->va.binding_table_pool.addr <
+ device->physical->va.internal_surface_state_pool.addr);
+ int64_t bt_pool_offset = (int64_t)device->physical->va.binding_table_pool.addr -
+ (int64_t)device->physical->va.internal_surface_state_pool.addr;
assert(INT32_MIN < bt_pool_offset && bt_pool_offset < 0);
result = anv_state_pool_init(&device->binding_table_pool, device,
- "binding table pool",
- SURFACE_STATE_POOL_MIN_ADDRESS,
- bt_pool_offset, 4096);
+ &(struct anv_state_pool_params) {
+ .name = "binding table pool",
+ .base_address = device->physical->va.internal_surface_state_pool.addr,
+ .start_offset = bt_pool_offset,
+ .block_size = BINDING_TABLE_POOL_BLOCK_SIZE,
+ .max_size = device->physical->va.internal_surface_state_pool.size,
+ });
+ }
+ if (result != VK_SUCCESS)
+ goto fail_bindless_surface_state_pool;
+
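On pre-Gfx12.5 parts the binding table pool reuses the surface state pool's base address with a negative start_offset. A worked example of that computation with made-up virtual addresses (real values come from device->physical->va):

#include <assert.h>
#include <stdint.h>

/* Illustrative only: a binding table pool at 0xc000000000 paired with a
 * surface state pool at 0xc040000000 gives
 *
 *    bt_pool_offset = 0xc000000000 - 0xc040000000 = -0x40000000
 *
 * which is negative (the binding table pool sits below the surface state
 * pool) and well above INT32_MIN, satisfying the assert in the code above.
 */
static void
example_bt_pool_offset(void)
{
   const int64_t bt_pool_offset =
      (int64_t)0xc000000000ull - (int64_t)0xc040000000ull;
   assert(INT32_MIN < bt_pool_offset && bt_pool_offset < 0);
}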
+ if (device->physical->indirect_descriptors) {
+ result = anv_state_pool_init(&device->indirect_push_descriptor_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "indirect push descriptor pool",
+ .base_address = device->physical->va.indirect_push_descriptor_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.indirect_push_descriptor_pool.size,
+ });
if (result != VK_SUCCESS)
- goto fail_surface_state_pool;
+ goto fail_binding_table_pool;
}
- if (device->info.has_aux_map) {
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ device->info->verx10 >= 125) {
+      /* On Gfx12.5+, because of the bindless stages (Mesh, Task, RT), the only
+       * way we can wire up push descriptors is through the bindless heap. This
+       * state pool is a 1GB carve-out of the 4GB HW heap.
+ */
+ result = anv_state_pool_init(&device->push_descriptor_buffer_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "push descriptor buffer state pool",
+ .base_address = device->physical->va.push_descriptor_buffer_pool.addr,
+ .block_size = 4096,
+ .max_size = device->physical->va.push_descriptor_buffer_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_indirect_push_descriptor_pool;
+ }
+
+ if (device->info->has_aux_map) {
+ result = anv_state_pool_init(&device->aux_tt_pool, device,
+ &(struct anv_state_pool_params) {
+ .name = "aux-tt pool",
+ .base_address = device->physical->va.aux_tt_pool.addr,
+ .block_size = 16384,
+ .max_size = device->physical->va.aux_tt_pool.size,
+ });
+ if (result != VK_SUCCESS)
+ goto fail_push_descriptor_buffer_pool;
+
device->aux_map_ctx = intel_aux_map_init(device, &aux_map_allocator,
&physical_device->info);
if (!device->aux_map_ctx)
- goto fail_binding_table_pool;
+ goto fail_aux_tt_pool;
}
- result = anv_device_alloc_bo(device, "workaround", 4096,
+ result = anv_device_alloc_bo(device, "workaround", 8192,
ANV_BO_ALLOC_CAPTURE |
+ ANV_BO_ALLOC_HOST_COHERENT |
ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_LOCAL_MEM,
+ ANV_BO_ALLOC_INTERNAL,
0 /* explicit_address */,
&device->workaround_bo);
if (result != VK_SUCCESS)
@@ -3513,85 +3674,296 @@ VkResult anv_CreateDevice(
device->workaround_address = (struct anv_address) {
.bo = device->workaround_bo,
- .offset = align_u32(
- intel_debug_write_identifiers(device->workaround_bo->map,
- device->workaround_bo->size,
- "Anv") + 8, 8),
+ .offset = align(intel_debug_write_identifiers(device->workaround_bo->map,
+ device->workaround_bo->size,
+ "Anv"), 32),
};
+ device->workarounds.doom64_images = NULL;
+
+ device->rt_uuid_addr = anv_address_add(device->workaround_address, 8);
+ memcpy(device->rt_uuid_addr.bo->map + device->rt_uuid_addr.offset,
+ physical_device->rt_uuid,
+ sizeof(physical_device->rt_uuid));
+
device->debug_frame_desc =
intel_debug_get_identifier_block(device->workaround_bo->map,
device->workaround_bo->size,
INTEL_DEBUG_BLOCK_TYPE_FRAME);
+ if (device->vk.enabled_extensions.KHR_ray_query) {
+ uint32_t ray_queries_size =
+ align(brw_rt_ray_queries_hw_stacks_size(device->info), 4096);
+
+ result = anv_device_alloc_bo(device, "ray queries",
+ ray_queries_size,
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */,
+ &device->ray_query_bo);
+ if (result != VK_SUCCESS)
+ goto fail_workaround_bo;
+ }
+
result = anv_device_init_trivial_batch(device);
if (result != VK_SUCCESS)
- goto fail_workaround_bo;
+ goto fail_ray_query_bo;
- /* Allocate a null surface state at surface state offset 0. This makes
- * NULL descriptor handling trivial because we can just memset structures
- * to zero and they have a valid descriptor.
+ /* Emit the CPS states before running the initialization batch as those
+ * structures are referenced.
*/
- device->null_surface_state =
- anv_state_pool_alloc(&device->surface_state_pool,
- device->isl_dev.ss.size,
- device->isl_dev.ss.align);
- isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
+ if (device->info->ver >= 12) {
+      uint32_t n_cps_states = 3 * 3; /* All combinations of X by Y CP sizes (1, 2, 4) */
+
+ if (device->info->has_coarse_pixel_primitive_and_cb)
+         n_cps_states *= 5 * 5; /* 5 combiner modes for each of the 2 combiner operations */
+
+ n_cps_states += 1; /* Disable CPS */
+
+      /* Each of the combinations must be replicated on all viewports */
+ n_cps_states *= MAX_VIEWPORTS;
+
+ device->cps_states =
+ anv_state_pool_alloc(&device->dynamic_state_pool,
+ n_cps_states * CPS_STATE_length(device->info) * 4,
+ 32);
+ if (device->cps_states.map == NULL)
+ goto fail_trivial_batch;
+
+ anv_genX(device->info, init_cps_device_state)(device);
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ device->cps_states_db =
+ anv_state_pool_alloc(&device->dynamic_state_db_pool,
+ device->cps_states.alloc_size, 32);
+ if (device->cps_states_db.map == NULL)
+ goto fail_trivial_batch;
+
+ memcpy(device->cps_states_db.map, device->cps_states.map,
+ device->cps_states.alloc_size);
+ }
+ }
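A worked count for the CPS state allocation above, assuming MAX_VIEWPORTS is 16 (the driver's compile-time viewport limit):

/* Worked arithmetic (assuming MAX_VIEWPORTS == 16):
 *
 *    without coarse-pixel combiners: (3 * 3 + 1) * 16         = 160 states
 *    with coarse-pixel combiners:    (3 * 3 * 5 * 5 + 1) * 16 = 3616 states
 *
 * Each state takes CPS_STATE_length(device->info) * 4 bytes in the dynamic
 * state pool (mirrored into the descriptor-buffer pool when
 * EXT_descriptor_buffer is enabled).
 */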
+
+ if (device->physical->indirect_descriptors) {
+ /* Allocate a null surface state at surface state offset 0. This makes
+ * NULL descriptor handling trivial because we can just memset
+ * structures to zero and they have a valid descriptor.
+ */
+ device->null_surface_state =
+ anv_state_pool_alloc(&device->bindless_surface_state_pool,
+ device->isl_dev.ss.size,
+ device->isl_dev.ss.align);
+ isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
+ .size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
+ assert(device->null_surface_state.offset == 0);
+ } else {
+      /* When using direct descriptors, the descriptors themselves can hold
+       * the null surface state directly. We still need a null surface for
+       * the binding table entries though, but this one can live anywhere in
+       * the internal surface state pool.
+ */
+ device->null_surface_state =
+ anv_state_pool_alloc(&device->internal_surface_state_pool,
+ device->isl_dev.ss.size,
+ device->isl_dev.ss.align);
+ isl_null_fill_state(&device->isl_dev, device->null_surface_state.map,
+ .size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
+ }
+
+ isl_null_fill_state(&device->isl_dev, &device->host_null_surface_state,
.size = isl_extent3d(1, 1, 1) /* This shouldn't matter */);
- assert(device->null_surface_state.offset == 0);
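With indirect descriptors, the null surface state deliberately lands at offset 0 of the bindless pool, so a zero-filled descriptor already references a valid null surface. A hedged sketch with a hypothetical descriptor layout (not the driver's real one):

#include <stdint.h>
#include <string.h>

/* Hypothetical layout, for illustration only: because the null surface sits
 * at offset 0, memset()-ing a descriptor to zero leaves its surface state
 * offset pointing at the null surface, which is exactly what a NULL
 * descriptor should do.
 */
struct example_indirect_descriptor {
   uint32_t surface_state_offset; /* offset into the bindless surface pool */
   uint32_t sampler_state_offset;
};

static void
example_write_null_descriptor(struct example_indirect_descriptor *desc)
{
   memset(desc, 0, sizeof(*desc)); /* surface_state_offset == 0 == null surface */
}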
anv_scratch_pool_init(device, &device->scratch_pool);
/* TODO(RT): Do we want some sort of data structure for this? */
memset(device->rt_scratch_bos, 0, sizeof(device->rt_scratch_bos));
- result = anv_genX(&device->info, init_device_state)(device);
+ if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
+ /* The docs say to always allocate 128KB per DSS */
+ const uint32_t btd_fifo_bo_size =
+ 128 * 1024 * intel_device_info_dual_subslice_id_bound(device->info);
+ result = anv_device_alloc_bo(device,
+ "rt-btd-fifo",
+ btd_fifo_bo_size,
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */,
+ &device->btd_fifo_bo);
+ if (result != VK_SUCCESS)
+ goto fail_trivial_batch_bo_and_scratch_pool;
+ }
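A worked size for the BTD FIFO allocation above, with a hypothetical topology:

/* Worked arithmetic with a made-up dual-subslice count: if
 * intel_device_info_dual_subslice_id_bound() returns 32, then
 *
 *    btd_fifo_bo_size = 128 KiB * 32 = 4 MiB
 */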
+
+ result = anv_device_init_trtt(device);
if (result != VK_SUCCESS)
- goto fail_trivial_batch_bo_and_scratch_pool;
+ goto fail_btd_fifo_bo;
- anv_pipeline_cache_init(&device->default_pipeline_cache, device,
- true /* cache_enabled */, false /* external_sync */);
+ result = anv_genX(device->info, init_device_state)(device);
+ if (result != VK_SUCCESS)
+ goto fail_trtt;
+
+ struct vk_pipeline_cache_create_info pcc_info = { };
+ device->default_pipeline_cache =
+ vk_pipeline_cache_create(&device->vk, &pcc_info, NULL);
+ if (!device->default_pipeline_cache) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail_trtt;
+ }
+
+   /* Internal shaders need their own pipeline cache because, unlike the rest
+    * of ANV, BLORP won't work at all without a cache: it relies on the cache
+    * to keep its shaders resident while it runs. Therefore, we need a special
+    * cache just for BLORP/RT that's forced to always be enabled.
+ */
+ pcc_info.force_enable = true;
+ device->internal_cache =
+ vk_pipeline_cache_create(&device->vk, &pcc_info, NULL);
+ if (device->internal_cache == NULL) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail_default_pipeline_cache;
+ }
+
+   /* The device (currently ICL/TGL) does not have float64 support. */
+ if (!device->info->has_64bit_float &&
+ device->physical->instance->fp64_workaround_enabled)
+ anv_load_fp64_shader(device);
result = anv_device_init_rt_shaders(device);
- if (result != VK_SUCCESS)
- goto fail_rt_trampoline;
+ if (result != VK_SUCCESS) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail_internal_cache;
+ }
+
+#if DETECT_OS_ANDROID
+ device->u_gralloc = u_gralloc_create(U_GRALLOC_TYPE_AUTO);
+#endif
+
+ device->robust_buffer_access =
+ device->vk.enabled_features.robustBufferAccess ||
+ device->vk.enabled_features.nullDescriptor;
+
+ device->breakpoint = anv_state_pool_alloc(&device->dynamic_state_pool, 4,
+ 4);
+ p_atomic_set(&device->draw_call_count, 0);
+
+ /* Create a separate command pool for companion RCS command buffer. */
+ if (device->info->verx10 >= 125) {
+ VkCommandPoolCreateInfo pool_info = {
+ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+ .queueFamilyIndex =
+ anv_get_first_render_queue_index(device->physical),
+ };
+
+ result = vk_common_CreateCommandPool(anv_device_to_handle(device),
+ &pool_info, NULL,
+ &device->companion_rcs_cmd_pool);
+ if (result != VK_SUCCESS) {
+ goto fail_internal_cache;
+ }
+ }
anv_device_init_blorp(device);
anv_device_init_border_colors(device);
+ anv_device_init_internal_kernels(device);
+
+ anv_device_init_astc_emu(device);
+
anv_device_perf_init(device);
+ anv_device_utrace_init(device);
+
+ anv_device_init_embedded_samplers(device);
+
+ BITSET_ONES(device->gfx_dirty_state);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_INDEX_BUFFER);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SO_DECL_LIST);
+ if (device->info->ver < 11)
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_VF_SGVS_2);
+ if (device->info->ver < 12) {
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_DEPTH_BOUNDS);
+ }
+ if (!device->vk.enabled_extensions.EXT_sample_locations)
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SAMPLE_PATTERN);
+ if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_CPS);
+ if (!device->vk.enabled_extensions.EXT_mesh_shader) {
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_SBE_MESH);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_CLIP_MESH);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_CONTROL);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_SHADER);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_MESH_DISTRIB);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_CONTROL);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_SHADER);
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_TASK_REDISTRIB);
+ }
+ if (!intel_needs_workaround(device->info, 18019816803))
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_WA_18019816803);
+ if (device->info->ver > 9)
+ BITSET_CLEAR(device->gfx_dirty_state, ANV_GFX_STATE_PMA_FIX);
+
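The dirty-state bitset starts with every bit set and then has the states this device can never emit cleared once at creation, so later emission code only tests bits. A minimal sketch of the pattern with a plain 64-bit mask (bit indices are hypothetical; the real code uses Mesa's BITSET macros and the ANV_GFX_STATE_* enum):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative trimming pattern: clear bits for states the device or the
 * enabled extensions can never require, so the emit loop skips them cheaply.
 */
static uint64_t
example_init_dirty_mask(bool has_mesh, bool has_sample_locations)
{
   uint64_t dirty = ~0ull; /* everything dirty by default */
   if (!has_mesh)
      dirty &= ~(1ull << 5 /* hypothetical MESH_SHADER bit */);
   if (!has_sample_locations)
      dirty &= ~(1ull << 9 /* hypothetical SAMPLE_PATTERN bit */);
   return dirty;
}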
*pDevice = anv_device_to_handle(device);
return VK_SUCCESS;
- fail_rt_trampoline:
- anv_pipeline_cache_finish(&device->default_pipeline_cache);
+ fail_internal_cache:
+ vk_pipeline_cache_destroy(device->internal_cache, NULL);
+ fail_default_pipeline_cache:
+ vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL);
+ fail_trtt:
+ anv_device_finish_trtt(device);
+ fail_btd_fifo_bo:
+ if (ANV_SUPPORT_RT && device->info->has_ray_tracing)
+ anv_device_release_bo(device, device->btd_fifo_bo);
fail_trivial_batch_bo_and_scratch_pool:
anv_scratch_pool_finish(device, &device->scratch_pool);
+ fail_trivial_batch:
anv_device_release_bo(device, device->trivial_batch_bo);
+ fail_ray_query_bo:
+ if (device->ray_query_bo)
+ anv_device_release_bo(device, device->ray_query_bo);
fail_workaround_bo:
anv_device_release_bo(device, device->workaround_bo);
fail_surface_aux_map_pool:
- if (device->info.has_aux_map) {
+ if (device->info->has_aux_map) {
intel_aux_map_finish(device->aux_map_ctx);
device->aux_map_ctx = NULL;
}
+ fail_aux_tt_pool:
+ if (device->info->has_aux_map)
+ anv_state_pool_finish(&device->aux_tt_pool);
+ fail_push_descriptor_buffer_pool:
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ device->info->verx10 >= 125)
+ anv_state_pool_finish(&device->push_descriptor_buffer_pool);
+ fail_indirect_push_descriptor_pool:
+ if (device->physical->indirect_descriptors)
+ anv_state_pool_finish(&device->indirect_push_descriptor_pool);
fail_binding_table_pool:
- if (physical_device->use_softpin)
- anv_state_pool_finish(&device->binding_table_pool);
- fail_surface_state_pool:
- anv_state_pool_finish(&device->surface_state_pool);
+ anv_state_pool_finish(&device->binding_table_pool);
+ fail_bindless_surface_state_pool:
+ if (device->physical->indirect_descriptors)
+ anv_state_pool_finish(&device->bindless_surface_state_pool);
+ fail_internal_surface_state_pool:
+ anv_state_pool_finish(&device->internal_surface_state_pool);
+ fail_scratch_surface_state_pool:
+ if (device->info->verx10 >= 125)
+ anv_state_pool_finish(&device->scratch_surface_state_pool);
fail_instruction_state_pool:
anv_state_pool_finish(&device->instruction_state_pool);
+ fail_reserved_array_pool:
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer)
+ anv_state_reserved_array_pool_finish(&device->custom_border_colors_db);
+ fail_dynamic_state_db_pool:
+ anv_state_reserved_pool_finish(&device->custom_border_colors);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer)
+ anv_state_pool_finish(&device->dynamic_state_db_pool);
fail_dynamic_state_pool:
- if (device->info.ver >= 8)
- anv_state_reserved_pool_finish(&device->custom_border_colors);
anv_state_pool_finish(&device->dynamic_state_pool);
fail_general_state_pool:
anv_state_pool_finish(&device->general_state_pool);
fail_batch_bo_pool:
+ if (device->vk.enabled_extensions.KHR_acceleration_structure)
+ anv_bo_pool_finish(&device->bvh_bo_pool);
anv_bo_pool_finish(&device->batch_bo_pool);
anv_bo_cache_finish(&device->bo_cache);
fail_queue_cond:
@@ -3599,17 +3971,19 @@ VkResult anv_CreateDevice(
fail_mutex:
pthread_mutex_destroy(&device->mutex);
fail_vmas:
- if (physical_device->use_softpin) {
- util_vma_heap_finish(&device->vma_hi);
- util_vma_heap_finish(&device->vma_cva);
- util_vma_heap_finish(&device->vma_lo);
- }
+ util_vma_heap_finish(&device->vma_trtt);
+ util_vma_heap_finish(&device->vma_samplers);
+ util_vma_heap_finish(&device->vma_desc_buf);
+ util_vma_heap_finish(&device->vma_desc);
+ util_vma_heap_finish(&device->vma_hi);
+ util_vma_heap_finish(&device->vma_lo);
+ pthread_mutex_destroy(&device->vma_mutex);
fail_queues:
for (uint32_t i = 0; i < device->queue_count; i++)
anv_queue_finish(&device->queues[i]);
vk_free(&device->vk.alloc, device->queues);
fail_context_id:
- anv_gem_destroy_context(device, device->context_id);
+ anv_device_destroy_context_or_vm(device);
fail_fd:
close(device->fd);
fail_device:
@@ -3629,20 +4003,58 @@ void anv_DestroyDevice(
if (!device)
return;
+#if DETECT_OS_ANDROID
+ u_gralloc_destroy(&device->u_gralloc);
+#endif
+
+ anv_memory_trace_finish(device);
+
+ struct anv_physical_device *pdevice = device->physical;
+
+ for (uint32_t i = 0; i < device->queue_count; i++)
+ anv_queue_finish(&device->queues[i]);
+ vk_free(&device->vk.alloc, device->queues);
+
+ anv_device_utrace_finish(device);
+
anv_device_finish_blorp(device);
anv_device_finish_rt_shaders(device);
- anv_pipeline_cache_finish(&device->default_pipeline_cache);
+ anv_device_finish_astc_emu(device);
+
+ anv_device_finish_internal_kernels(device);
+
+ vk_pipeline_cache_destroy(device->internal_cache, NULL);
+ vk_pipeline_cache_destroy(device->default_pipeline_cache, NULL);
+
+ anv_device_finish_embedded_samplers(device);
+
+ anv_device_finish_trtt(device);
+
+ if (ANV_SUPPORT_RT && device->info->has_ray_tracing)
+ anv_device_release_bo(device, device->btd_fifo_bo);
+
+ if (device->info->verx10 >= 125) {
+ vk_common_DestroyCommandPool(anv_device_to_handle(device),
+ device->companion_rcs_cmd_pool, NULL);
+ }
#ifdef HAVE_VALGRIND
/* We only need to free these to prevent valgrind errors. The backing
* BO will go away in a couple of lines so we don't actually leak.
*/
- if (device->info.ver >= 8)
- anv_state_reserved_pool_finish(&device->custom_border_colors);
+ anv_state_reserved_pool_finish(&device->custom_border_colors);
anv_state_pool_free(&device->dynamic_state_pool, device->border_colors);
anv_state_pool_free(&device->dynamic_state_pool, device->slice_hash);
+ anv_state_pool_free(&device->dynamic_state_pool, device->cps_states);
+ anv_state_pool_free(&device->dynamic_state_pool, device->breakpoint);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ anv_state_pool_free(&device->dynamic_state_db_pool, device->cps_states_db);
+ anv_state_pool_free(&device->dynamic_state_db_pool, device->slice_hash_db);
+ anv_state_pool_free(&device->dynamic_state_db_pool, device->border_colors_db);
+ anv_state_reserved_array_pool_finish(&device->custom_border_colors_db);
+ }
#endif
for (unsigned i = 0; i < ARRAY_SIZE(device->rt_scratch_bos); i++) {
@@ -3652,42 +4064,66 @@ void anv_DestroyDevice(
anv_scratch_pool_finish(device, &device->scratch_pool);
+ if (device->vk.enabled_extensions.KHR_ray_query) {
+ for (unsigned i = 0; i < ARRAY_SIZE(device->ray_query_shadow_bos); i++) {
+ if (device->ray_query_shadow_bos[i] != NULL)
+ anv_device_release_bo(device, device->ray_query_shadow_bos[i]);
+ }
+ anv_device_release_bo(device, device->ray_query_bo);
+ }
anv_device_release_bo(device, device->workaround_bo);
anv_device_release_bo(device, device->trivial_batch_bo);
- if (device->info.has_aux_map) {
+ if (device->info->has_aux_map) {
intel_aux_map_finish(device->aux_map_ctx);
device->aux_map_ctx = NULL;
+ anv_state_pool_finish(&device->aux_tt_pool);
}
-
- if (device->physical->use_softpin)
- anv_state_pool_finish(&device->binding_table_pool);
- anv_state_pool_finish(&device->surface_state_pool);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ device->info->verx10 >= 125)
+ anv_state_pool_finish(&device->push_descriptor_buffer_pool);
+ if (device->physical->indirect_descriptors)
+ anv_state_pool_finish(&device->indirect_push_descriptor_pool);
+ anv_state_pool_finish(&device->binding_table_pool);
+ if (device->info->verx10 >= 125)
+ anv_state_pool_finish(&device->scratch_surface_state_pool);
+ anv_state_pool_finish(&device->internal_surface_state_pool);
+ if (device->physical->indirect_descriptors)
+ anv_state_pool_finish(&device->bindless_surface_state_pool);
anv_state_pool_finish(&device->instruction_state_pool);
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer)
+ anv_state_pool_finish(&device->dynamic_state_db_pool);
anv_state_pool_finish(&device->dynamic_state_pool);
anv_state_pool_finish(&device->general_state_pool);
+ if (device->vk.enabled_extensions.KHR_acceleration_structure)
+ anv_bo_pool_finish(&device->bvh_bo_pool);
anv_bo_pool_finish(&device->batch_bo_pool);
anv_bo_cache_finish(&device->bo_cache);
- if (device->physical->use_softpin) {
- util_vma_heap_finish(&device->vma_hi);
- util_vma_heap_finish(&device->vma_cva);
- util_vma_heap_finish(&device->vma_lo);
- }
+ util_vma_heap_finish(&device->vma_trtt);
+ util_vma_heap_finish(&device->vma_samplers);
+ util_vma_heap_finish(&device->vma_desc_buf);
+ util_vma_heap_finish(&device->vma_desc);
+ util_vma_heap_finish(&device->vma_hi);
+ util_vma_heap_finish(&device->vma_lo);
+ pthread_mutex_destroy(&device->vma_mutex);
pthread_cond_destroy(&device->queue_submit);
pthread_mutex_destroy(&device->mutex);
- for (uint32_t i = 0; i < device->queue_count; i++)
- anv_queue_finish(&device->queues[i]);
- vk_free(&device->vk.alloc, device->queues);
+ ralloc_free(device->fp64_nir);
- anv_gem_destroy_context(device, device->context_id);
+ anv_device_destroy_context_or_vm(device);
- if (INTEL_DEBUG & DEBUG_BATCH)
- intel_batch_decode_ctx_finish(&device->decoder_ctx);
+ if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_BATCH_STATS)) {
+ for (unsigned i = 0; i < pdevice->queue.family_count; i++) {
+ if (INTEL_DEBUG(DEBUG_BATCH_STATS))
+ intel_batch_print_stats(&device->decoder[i]);
+ intel_batch_decode_ctx_finish(&device->decoder[i]);
+ }
+ }
close(device->fd);
@@ -3705,161 +4141,7 @@ VkResult anv_EnumerateInstanceLayerProperties(
}
/* None supported at this time */
- return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
-}
-
-void anv_GetDeviceQueue2(
- VkDevice _device,
- const VkDeviceQueueInfo2* pQueueInfo,
- VkQueue* pQueue)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_physical_device *pdevice = device->physical;
-
- assert(pQueueInfo->queueFamilyIndex < pdevice->queue.family_count);
- struct anv_queue_family *queue_family =
- &pdevice->queue.families[pQueueInfo->queueFamilyIndex];
-
- int idx_in_family = 0;
- struct anv_queue *queue = NULL;
- for (uint32_t i = 0; i < device->queue_count; i++) {
- if (device->queues[i].family != queue_family)
- continue;
-
- if (idx_in_family == pQueueInfo->queueIndex) {
- queue = &device->queues[i];
- break;
- }
-
- idx_in_family++;
- }
- assert(queue != NULL);
-
- if (queue && queue->flags == pQueueInfo->flags)
- *pQueue = anv_queue_to_handle(queue);
- else
- *pQueue = NULL;
-}
-
-void
-_anv_device_report_lost(struct anv_device *device)
-{
- assert(p_atomic_read(&device->_lost) > 0);
-
- device->lost_reported = true;
-
- for (uint32_t i = 0; i < device->queue_count; i++) {
- struct anv_queue *queue = &device->queues[i];
- if (queue->lost) {
- __vk_errorf(device->physical->instance, &device->vk.base,
- VK_ERROR_DEVICE_LOST,
- queue->error_file, queue->error_line,
- "%s", queue->error_msg);
- }
- }
-}
-
-VkResult
-_anv_device_set_lost(struct anv_device *device,
- const char *file, int line,
- const char *msg, ...)
-{
- VkResult err;
- va_list ap;
-
- if (p_atomic_read(&device->_lost) > 0)
- return VK_ERROR_DEVICE_LOST;
-
- p_atomic_inc(&device->_lost);
- device->lost_reported = true;
-
- va_start(ap, msg);
- err = __vk_errorv(device->physical->instance, &device->vk.base,
- VK_ERROR_DEVICE_LOST, file, line, msg, ap);
- va_end(ap);
-
- if (env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false))
- abort();
-
- return err;
-}
-
-VkResult
-_anv_queue_set_lost(struct anv_queue *queue,
- const char *file, int line,
- const char *msg, ...)
-{
- va_list ap;
-
- if (queue->lost)
- return VK_ERROR_DEVICE_LOST;
-
- queue->lost = true;
-
- queue->error_file = file;
- queue->error_line = line;
- va_start(ap, msg);
- vsnprintf(queue->error_msg, sizeof(queue->error_msg),
- msg, ap);
- va_end(ap);
-
- p_atomic_inc(&queue->device->_lost);
-
- if (env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false))
- abort();
-
- return VK_ERROR_DEVICE_LOST;
-}
-
-VkResult
-anv_device_query_status(struct anv_device *device)
-{
- /* This isn't likely as most of the callers of this function already check
- * for it. However, it doesn't hurt to check and it potentially lets us
- * avoid an ioctl.
- */
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
-
- uint32_t active, pending;
- int ret = anv_gem_context_get_reset_stats(device->fd, device->context_id,
- &active, &pending);
- if (ret == -1) {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "get_reset_stats failed: %m");
- }
-
- if (active) {
- return anv_device_set_lost(device, "GPU hung on one of our command buffers");
- } else if (pending) {
- return anv_device_set_lost(device, "GPU hung with commands in-flight");
- }
-
- return VK_SUCCESS;
-}
-
-VkResult
-anv_device_bo_busy(struct anv_device *device, struct anv_bo *bo)
-{
- /* Note: This only returns whether or not the BO is in use by an i915 GPU.
- * Other usages of the BO (such as on different hardware) will not be
- * flagged as "busy" by this ioctl. Use with care.
- */
- int ret = anv_gem_busy(device, bo->gem_handle);
- if (ret == 1) {
- return VK_NOT_READY;
- } else if (ret == -1) {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "gem wait failed: %m");
- }
-
- /* Query for device status after the busy call. If the BO we're checking
- * got caught in a GPU hang we don't want to return VK_SUCCESS to the
- * client because it clearly doesn't have valid data. Yes, this most
- * likely means an ioctl, but we just did an ioctl to query the busy status
- * so it's no great loss.
- */
- return anv_device_query_status(device);
+ return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
}
VkResult
@@ -3871,52 +4153,60 @@ anv_device_wait(struct anv_device *device, struct anv_bo *bo,
return VK_TIMEOUT;
} else if (ret == -1) {
/* We don't know the real error. */
- return anv_device_set_lost(device, "gem wait failed: %m");
+ return vk_device_set_lost(&device->vk, "gem wait failed: %m");
+ } else {
+ return VK_SUCCESS;
}
-
- /* Query for device status after the wait. If the BO we're waiting on got
- * caught in a GPU hang we don't want to return VK_SUCCESS to the client
- * because it clearly doesn't have valid data. Yes, this most likely means
- * an ioctl, but we just did an ioctl to wait so it's no great loss.
- */
- return anv_device_query_status(device);
}
-VkResult anv_DeviceWaitIdle(
- VkDevice _device)
+static struct util_vma_heap *
+anv_vma_heap_for_flags(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
+ if (alloc_flags & ANV_BO_ALLOC_TRTT)
+ return &device->vma_trtt;
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
+ if (alloc_flags & ANV_BO_ALLOC_DESCRIPTOR_BUFFER_POOL)
+ return &device->vma_desc_buf;
- for (uint32_t i = 0; i < device->queue_count; i++) {
- VkResult res = anv_queue_submit_simple_batch(&device->queues[i], NULL);
- if (res != VK_SUCCESS)
- return res;
- }
+ if (alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS)
+ return &device->vma_lo;
- return VK_SUCCESS;
+ if (alloc_flags & ANV_BO_ALLOC_DESCRIPTOR_POOL)
+ return &device->vma_desc;
+
+ if (alloc_flags & ANV_BO_ALLOC_SAMPLER_POOL)
+ return &device->vma_samplers;
+
+ return &device->vma_hi;
}
uint64_t
anv_vma_alloc(struct anv_device *device,
uint64_t size, uint64_t align,
enum anv_bo_alloc_flags alloc_flags,
- uint64_t client_address)
+ uint64_t client_address,
+ struct util_vma_heap **out_vma_heap)
{
pthread_mutex_lock(&device->vma_mutex);
uint64_t addr = 0;
+ *out_vma_heap = anv_vma_heap_for_flags(device, alloc_flags);
if (alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) {
+ assert(*out_vma_heap == &device->vma_hi ||
+ *out_vma_heap == &device->vma_desc_buf ||
+ *out_vma_heap == &device->vma_trtt);
+
if (client_address) {
- if (util_vma_heap_alloc_addr(&device->vma_cva,
+ if (util_vma_heap_alloc_addr(*out_vma_heap,
client_address, size)) {
addr = client_address;
}
} else {
- addr = util_vma_heap_alloc(&device->vma_cva, size, align);
+ (*out_vma_heap)->alloc_high = false;
+ addr = util_vma_heap_alloc(*out_vma_heap, size, align);
+ (*out_vma_heap)->alloc_high = true;
}
/* We don't want to fall back to other heaps */
goto done;
@@ -3924,11 +4214,7 @@ anv_vma_alloc(struct anv_device *device,
assert(client_address == 0);
- if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS))
- addr = util_vma_heap_alloc(&device->vma_hi, size, align);
-
- if (addr == 0)
- addr = util_vma_heap_alloc(&device->vma_lo, size, align);
+ addr = util_vma_heap_alloc(*out_vma_heap, size, align);
done:
pthread_mutex_unlock(&device->vma_mutex);
@@ -3939,22 +4225,21 @@ done:
void
anv_vma_free(struct anv_device *device,
+ struct util_vma_heap *vma_heap,
uint64_t address, uint64_t size)
{
+ assert(vma_heap == &device->vma_lo ||
+ vma_heap == &device->vma_hi ||
+ vma_heap == &device->vma_desc ||
+ vma_heap == &device->vma_desc_buf ||
+ vma_heap == &device->vma_samplers ||
+ vma_heap == &device->vma_trtt);
+
const uint64_t addr_48b = intel_48b_address(address);
pthread_mutex_lock(&device->vma_mutex);
- if (addr_48b >= LOW_HEAP_MIN_ADDRESS &&
- addr_48b <= LOW_HEAP_MAX_ADDRESS) {
- util_vma_heap_free(&device->vma_lo, addr_48b, size);
- } else if (addr_48b >= CLIENT_VISIBLE_HEAP_MIN_ADDRESS &&
- addr_48b <= CLIENT_VISIBLE_HEAP_MAX_ADDRESS) {
- util_vma_heap_free(&device->vma_cva, addr_48b, size);
- } else {
- assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS);
- util_vma_heap_free(&device->vma_hi, addr_48b, size);
- }
+ util_vma_heap_free(vma_heap, addr_48b, size);
pthread_mutex_unlock(&device->vma_mutex);
}
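anv_vma_alloc() now reports which heap the address came from, and anv_vma_free() takes that same heap back instead of re-deriving it from the address range. A minimal usage sketch (flag choice and sizes are illustrative):

/* Minimal usage sketch of the updated VMA API. */
static void
example_vma_roundtrip(struct anv_device *device)
{
   struct util_vma_heap *heap = NULL;
   uint64_t addr = anv_vma_alloc(device, 4096 /* size */, 4096 /* align */,
                                 ANV_BO_ALLOC_32BIT_ADDRESS,
                                 0 /* client_address */, &heap);
   if (addr == 0)
      return; /* allocation failed */

   /* ... bind a BO at addr ... */

   anv_vma_free(device, heap, addr, 4096);
}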
@@ -3972,125 +4257,165 @@ VkResult anv_AllocateMemory(
assert(pAllocateInfo->sType == VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO);
- /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */
- assert(pAllocateInfo->allocationSize > 0);
-
VkDeviceSize aligned_alloc_size =
- align_u64(pAllocateInfo->allocationSize, 4096);
-
- if (aligned_alloc_size > MAX_MEMORY_ALLOCATION_SIZE)
- return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ align64(pAllocateInfo->allocationSize, 4096);
assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count);
- struct anv_memory_type *mem_type =
+ const struct anv_memory_type *mem_type =
&pdevice->memory.types[pAllocateInfo->memoryTypeIndex];
assert(mem_type->heapIndex < pdevice->memory.heap_count);
struct anv_memory_heap *mem_heap =
&pdevice->memory.heaps[mem_type->heapIndex];
+ if (aligned_alloc_size > mem_heap->size)
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
uint64_t mem_heap_used = p_atomic_read(&mem_heap->used);
if (mem_heap_used + aligned_alloc_size > mem_heap->size)
- return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
- mem = vk_object_alloc(&device->vk, pAllocator, sizeof(*mem),
- VK_OBJECT_TYPE_DEVICE_MEMORY);
+ mem = vk_device_memory_create(&device->vk, pAllocateInfo,
+ pAllocator, sizeof(*mem));
if (mem == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
mem->type = mem_type;
mem->map = NULL;
mem->map_size = 0;
- mem->ahw = NULL;
- mem->host_ptr = NULL;
+ mem->map_delta = 0;
enum anv_bo_alloc_flags alloc_flags = 0;
- const VkExportMemoryAllocateInfo *export_info = NULL;
- const VkImportAndroidHardwareBufferInfoANDROID *ahw_import_info = NULL;
const VkImportMemoryFdInfoKHR *fd_info = NULL;
- const VkImportMemoryHostPointerInfoEXT *host_ptr_info = NULL;
const VkMemoryDedicatedAllocateInfo *dedicated_info = NULL;
- VkMemoryAllocateFlags vk_flags = 0;
+ const struct wsi_memory_allocate_info *wsi_info = NULL;
uint64_t client_address = 0;
vk_foreach_struct_const(ext, pAllocateInfo->pNext) {
- switch (ext->sType) {
+ /* VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA isn't a real enum
+ * value, so cast to avoid a compiler warning.
+ */
+ switch ((uint32_t)ext->sType) {
case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO:
- export_info = (void *)ext;
- break;
-
case VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID:
- ahw_import_info = (void *)ext;
+ case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT:
+ case VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR:
+ case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO:
+ /* handled by vk_device_memory_create */
break;
case VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR:
fd_info = (void *)ext;
break;
- case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT:
- host_ptr_info = (void *)ext;
- break;
-
- case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: {
- const VkMemoryAllocateFlagsInfo *flags_info = (void *)ext;
- vk_flags = flags_info->flags;
- break;
- }
-
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO:
dedicated_info = (void *)ext;
break;
- case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR: {
- const VkMemoryOpaqueCaptureAddressAllocateInfoKHR *addr_info =
- (const VkMemoryOpaqueCaptureAddressAllocateInfoKHR *)ext;
+ case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO: {
+ const VkMemoryOpaqueCaptureAddressAllocateInfo *addr_info =
+ (const VkMemoryOpaqueCaptureAddressAllocateInfo *)ext;
client_address = addr_info->opaqueCaptureAddress;
break;
}
+ case VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA:
+ wsi_info = (void *)ext;
+ break;
+
default:
anv_debug_ignored_stype(ext->sType);
break;
}
}
- /* By default, we want all VkDeviceMemory objects to support CCS */
- if (device->physical->has_implicit_ccs)
- alloc_flags |= ANV_BO_ALLOC_IMPLICIT_CCS;
+ /* If i915 reported mappable/non_mappable vram regions and the
+ * application wants lmem mappable, then we need to use the
+ * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS flag to create our BO.
+ */
+ if (pdevice->vram_mappable.size > 0 &&
+ pdevice->vram_non_mappable.size > 0 &&
+ (mem_type->propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) &&
+ (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT))
+ alloc_flags |= ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE;
+
+ if (!mem_heap->is_local_mem)
+ alloc_flags |= ANV_BO_ALLOC_NO_LOCAL_MEM;
- if (vk_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR)
+ if (mem->vk.alloc_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT)
alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
- if ((export_info && export_info->handleTypes) ||
- (fd_info && fd_info->handleType) ||
- (host_ptr_info && host_ptr_info->handleType)) {
- /* Anything imported or exported is EXTERNAL */
- alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
+ if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_PROTECTED_BIT)
+ alloc_flags |= ANV_BO_ALLOC_PROTECTED;
- /* We can't have implicit CCS on external memory with an AUX-table.
- * Doing so would require us to sync the aux tables across processes
- * which is impractical.
- */
- if (device->info.has_aux_map)
- alloc_flags &= ~ANV_BO_ALLOC_IMPLICIT_CCS;
- }
+ /* For now, always allocate AUX-TT aligned memory, regardless of dedicated
+ * allocations. An application can, for example, suballocate a large
+ * VkDeviceMemory and try to bind an image created with a CCS modifier. In
+ * that case we cannot disable CCS if the alignment doesn't meet the AUX-TT
+ * requirements, so we need to ensure both the VkDeviceMemory and the
+ * alignment reported through vkGetImageMemoryRequirements() meet the
+ * AUX-TT requirement.
+ *
+ * TODO: when we enable EXT_descriptor_buffer, we'll be able to drop the
+ * AUX-TT alignment for that type of allocation.
+ */
+ if (device->info->has_aux_map)
+ alloc_flags |= ANV_BO_ALLOC_AUX_TT_ALIGNED;
- /* Check if we need to support Android HW buffer export. If so,
- * create AHardwareBuffer and import memory from it.
+ /* If the allocation is neither dedicated nor a host pointer, allocate
+ * additional CCS space.
+ *
+ * TODO: If we ever ship VK_EXT_descriptor_buffer (ahahah... :() we could
+ * drop this flag in the descriptor buffer case as we don't need any
+ * compression there.
+ *
+ * TODO: We could also create new memory types for allocations that don't
+ * need any compression.
*/
- bool android_export = false;
- if (export_info && export_info->handleTypes &
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID)
- android_export = true;
+ if (device->physical->alloc_aux_tt_mem &&
+ dedicated_info == NULL &&
+ mem->vk.host_ptr == NULL)
+ alloc_flags |= ANV_BO_ALLOC_AUX_CCS;
+
+ /* TODO: Android, ChromeOS and other applications may need another way to
+ * allocate buffers that can be scanned out to a display, but it should be
+ * pretty easy to catch those as the Xe KMD driver will print warnings in
+ * dmesg when scanning out buffers allocated without the proper flag set.
+ */
+ if (wsi_info)
+ alloc_flags |= ANV_BO_ALLOC_SCANOUT;
- if (ahw_import_info) {
- result = anv_import_ahw_memory(_device, mem, ahw_import_info);
- if (result != VK_SUCCESS)
- goto fail;
+ /* Anything imported or exported is EXTERNAL */
+ if (mem->vk.export_handle_types || mem->vk.import_handle_type) {
+ alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
- goto success;
- } else if (android_export) {
- result = anv_create_ahw_memory(_device, mem, pAllocateInfo);
+ /* WSI has its own way of synchronizing with the compositor */
+ if (pdevice->instance->external_memory_implicit_sync &&
+ !wsi_info && dedicated_info &&
+ dedicated_info->image != VK_NULL_HANDLE) {
+ ANV_FROM_HANDLE(anv_image, image, dedicated_info->image);
+
+ /* Apply implicit sync to be compatible with clients relying on
+ * implicit fencing. This matches the behavior in iris i915_batch
+ * submit. An example client is VA-API (iHD), so only the dedicated
+ * image scenario has to be covered.
+ */
+ alloc_flags |= ANV_BO_ALLOC_IMPLICIT_SYNC;
+
+ /* For color attachments, apply IMPLICIT_WRITE so a client on the
+ * consumer side relying on implicit fencing has a fence to wait on
+ * for render completion.
+ */
+ if (image->vk.usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
+ alloc_flags |= ANV_BO_ALLOC_IMPLICIT_WRITE;
+ }
+ }
+
+ if (mem_type->descriptor_buffer)
+ alloc_flags |= ANV_BO_ALLOC_DESCRIPTOR_BUFFER_POOL;
+
+ if (mem->vk.ahardware_buffer) {
+ result = anv_import_ahw_memory(_device, mem);
if (result != VK_SUCCESS)
goto fail;
@@ -4121,8 +4446,7 @@ VkResult anv_AllocateMemory(
* this sort of attack but only if it can trust the buffer size.
*/
if (mem->bo->size < aligned_alloc_size) {
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ result = vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
"aligned allocationSize too large for "
"VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT: "
"%"PRIu64"B > %"PRIu64"B",
@@ -4144,34 +4468,39 @@ VkResult anv_AllocateMemory(
goto success;
}
- if (host_ptr_info && host_ptr_info->handleType) {
- if (host_ptr_info->handleType ==
+ if (mem->vk.host_ptr) {
+ if (mem->vk.import_handle_type ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT) {
- result = vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ result = vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
goto fail;
}
- assert(host_ptr_info->handleType ==
+ assert(mem->vk.import_handle_type ==
VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT);
result = anv_device_import_bo_from_host_ptr(device,
- host_ptr_info->pHostPointer,
- pAllocateInfo->allocationSize,
+ mem->vk.host_ptr,
+ mem->vk.size,
alloc_flags,
client_address,
&mem->bo);
if (result != VK_SUCCESS)
goto fail;
- mem->host_ptr = host_ptr_info->pHostPointer;
goto success;
}
- /* Set ALLOC_LOCAL_MEM flag if heap has device local bit set and requested
- * memory property flag has DEVICE_LOCAL_BIT set.
- */
- if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
- alloc_flags |= ANV_BO_ALLOC_LOCAL_MEM;
+ if (alloc_flags & (ANV_BO_ALLOC_EXTERNAL | ANV_BO_ALLOC_SCANOUT)) {
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
+ } else if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
+ if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
+ if (mem_type->propertyFlags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT)
+ alloc_flags |= ANV_BO_ALLOC_HOST_CACHED;
+ } else {
+ /* Some host mode is required so that a valid PAT index gets set */
+ alloc_flags |= ANV_BO_ALLOC_HOST_COHERENT;
+ }
/* Regular allocate (not importing memory). */
@@ -4187,16 +4516,12 @@ VkResult anv_AllocateMemory(
* the BO. In this case, we have a dedicated allocation.
*/
if (image->vk.wsi_legacy_scanout) {
- const uint32_t i915_tiling =
- isl_tiling_to_i915_tiling(image->planes[0].primary_surface.isl.tiling);
- int ret = anv_gem_set_tiling(device, mem->bo->gem_handle,
- image->planes[0].primary_surface.isl.row_pitch_B,
- i915_tiling);
- if (ret) {
+ const struct isl_surf *surf = &image->planes[0].primary_surface.isl;
+ result = anv_device_set_bo_tiling(device, mem->bo,
+ surf->row_pitch_B,
+ surf->tiling);
+ if (result != VK_SUCCESS) {
anv_device_release_bo(device, mem->bo);
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_OUT_OF_DEVICE_MEMORY,
- "failed to set BO tiling: %m");
goto fail;
}
}
@@ -4207,8 +4532,7 @@ VkResult anv_AllocateMemory(
if (mem_heap_used > mem_heap->size) {
p_atomic_add(&mem_heap->used, -mem->bo->size);
anv_device_release_bo(device, mem->bo);
- result = vk_errorf(device, &device->vk.base,
- VK_ERROR_OUT_OF_DEVICE_MEMORY,
+ result = vk_errorf(device, VK_ERROR_OUT_OF_DEVICE_MEMORY,
"Out of heap memory");
goto fail;
}
@@ -4217,12 +4541,14 @@ VkResult anv_AllocateMemory(
list_addtail(&mem->link, &device->memory_objects);
pthread_mutex_unlock(&device->mutex);
+ ANV_RMV(heap_create, device, mem, false, 0);
+
*pMem = anv_device_memory_to_handle(mem);
return VK_SUCCESS;
fail:
- vk_object_free(&device->vk, pAllocator, mem);
+ vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
return result;
}
@@ -4266,7 +4592,7 @@ VkResult anv_GetMemoryFdPropertiesKHR(
*
* So opaque handle types fall into the default "unsupported" case.
*/
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+ return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
}
}
@@ -4309,71 +4635,84 @@ void anv_FreeMemory(
list_del(&mem->link);
pthread_mutex_unlock(&device->mutex);
- if (mem->map)
- anv_UnmapMemory(_device, _mem);
+ if (mem->map) {
+ const VkMemoryUnmapInfoKHR unmap = {
+ .sType = VK_STRUCTURE_TYPE_MEMORY_UNMAP_INFO_KHR,
+ .memory = _mem,
+ };
+ anv_UnmapMemory2KHR(_device, &unmap);
+ }
p_atomic_add(&device->physical->memory.heaps[mem->type->heapIndex].used,
-mem->bo->size);
anv_device_release_bo(device, mem->bo);
-#if defined(ANDROID) && ANDROID_API_LEVEL >= 26
- if (mem->ahw)
- AHardwareBuffer_release(mem->ahw);
-#endif
+ ANV_RMV(resource_destroy, device, mem);
- vk_object_free(&device->vk, pAllocator, mem);
+ vk_device_memory_destroy(&device->vk, pAllocator, &mem->vk);
}
-VkResult anv_MapMemory(
+VkResult anv_MapMemory2KHR(
VkDevice _device,
- VkDeviceMemory _memory,
- VkDeviceSize offset,
- VkDeviceSize size,
- VkMemoryMapFlags flags,
+ const VkMemoryMapInfoKHR* pMemoryMapInfo,
void** ppData)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_device_memory, mem, _memory);
+ ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryMapInfo->memory);
if (mem == NULL) {
*ppData = NULL;
return VK_SUCCESS;
}
- if (mem->host_ptr) {
- *ppData = mem->host_ptr + offset;
+ if (mem->vk.host_ptr) {
+ *ppData = mem->vk.host_ptr + pMemoryMapInfo->offset;
return VK_SUCCESS;
}
- if (size == VK_WHOLE_SIZE)
- size = mem->bo->size - offset;
-
/* From the Vulkan spec version 1.0.32 docs for MapMemory:
*
- * * If size is not equal to VK_WHOLE_SIZE, size must be greater than 0
- * assert(size != 0);
- * * If size is not equal to VK_WHOLE_SIZE, size must be less than or
- * equal to the size of the memory minus offset
+ * * memory must have been created with a memory type that reports
+ * VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
*/
- assert(size > 0);
- assert(offset + size <= mem->bo->size);
+ if (!(mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
+ "Memory object not mappable.");
+ }
- /* FIXME: Is this supposed to be thread safe? Since vkUnmapMemory() only
- * takes a VkDeviceMemory pointer, it seems like only one map of the memory
- * at a time is valid. We could just mmap up front and return an offset
- * pointer here, but that may exhaust virtual memory on 32 bit
- * userspace. */
+ assert(pMemoryMapInfo->size > 0);
+ const VkDeviceSize offset = pMemoryMapInfo->offset;
+ const VkDeviceSize size =
+ vk_device_memory_range(&mem->vk, pMemoryMapInfo->offset,
+ pMemoryMapInfo->size);
- uint32_t gem_flags = 0;
+ if (size != (size_t)size) {
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
+ "requested size 0x%"PRIx64" does not fit in %u bits",
+ size, (unsigned)(sizeof(size_t) * 8));
+ }
+
+ /* From the Vulkan 1.2.194 spec:
+ *
+ * "memory must not be currently host mapped"
+ */
+ if (mem->map != NULL) {
+ return vk_errorf(device, VK_ERROR_MEMORY_MAP_FAILED,
+ "Memory object already mapped.");
+ }
- if (!device->info.has_llc &&
- (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
- gem_flags |= I915_MMAP_WC;
+ void *placed_addr = NULL;
+ if (pMemoryMapInfo->flags & VK_MEMORY_MAP_PLACED_BIT_EXT) {
+ const VkMemoryMapPlacedInfoEXT *placed_info =
+ vk_find_struct_const(pMemoryMapInfo->pNext, MEMORY_MAP_PLACED_INFO_EXT);
+ assert(placed_info != NULL);
+ placed_addr = placed_info->pPlacedAddress;
+ }
/* GEM will fail to map if the offset isn't 4k-aligned. Round down. */
uint64_t map_offset;
- if (!device->physical->has_mmap_offset)
+ if (!device->physical->info.has_mmap_offset)
map_offset = offset & ~4095ull;
else
map_offset = 0;
@@ -4381,53 +4720,43 @@ VkResult anv_MapMemory(
uint64_t map_size = (offset + size) - map_offset;
/* Let's map whole pages */
- map_size = align_u64(map_size, 4096);
+ map_size = align64(map_size, 4096);
- void *map = anv_gem_mmap(device, mem->bo->gem_handle,
- map_offset, map_size, gem_flags);
- if (map == MAP_FAILED)
- return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
+ void *map;
+ VkResult result = anv_device_map_bo(device, mem->bo, map_offset,
+ map_size, placed_addr, &map);
+ if (result != VK_SUCCESS)
+ return result;
mem->map = map;
mem->map_size = map_size;
-
- *ppData = mem->map + (offset - map_offset);
+ mem->map_delta = (offset - map_offset);
+ *ppData = mem->map + mem->map_delta;
return VK_SUCCESS;
}
-void anv_UnmapMemory(
+VkResult anv_UnmapMemory2KHR(
VkDevice _device,
- VkDeviceMemory _memory)
+ const VkMemoryUnmapInfoKHR* pMemoryUnmapInfo)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_device_memory, mem, _memory);
+ ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryUnmapInfo->memory);
- if (mem == NULL || mem->host_ptr)
- return;
+ if (mem == NULL || mem->vk.host_ptr)
+ return VK_SUCCESS;
- anv_gem_munmap(device, mem->map, mem->map_size);
+ VkResult result =
+ anv_device_unmap_bo(device, mem->bo, mem->map, mem->map_size,
+ pMemoryUnmapInfo->flags & VK_MEMORY_UNMAP_RESERVE_BIT_EXT);
+ if (result != VK_SUCCESS)
+ return result;
mem->map = NULL;
mem->map_size = 0;
-}
-
-static void
-clflush_mapped_ranges(struct anv_device *device,
- uint32_t count,
- const VkMappedMemoryRange *ranges)
-{
- for (uint32_t i = 0; i < count; i++) {
- ANV_FROM_HANDLE(anv_device_memory, mem, ranges[i].memory);
- if (ranges[i].offset >= mem->map_size)
- continue;
+ mem->map_delta = 0;
- if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
- continue;
-
- intel_clflush_range(mem->map + ranges[i].offset,
- MIN2(ranges[i].size, mem->map_size - ranges[i].offset));
- }
+ return VK_SUCCESS;
}
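
A short application-side sketch, assuming VK_KHR_map_memory2 is enabled, of the entry points this change switches to; the names follow the Vulkan spec, not this patch.

#include <vulkan/vulkan.h>

static void
example_map_unmap(VkDevice device, VkDeviceMemory memory)
{
   void *data = NULL;
   const VkMemoryMapInfoKHR map_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO_KHR,
      .memory = memory,
      .offset = 0,
      .size = VK_WHOLE_SIZE, /* resolved by the driver via vk_device_memory_range() */
   };
   if (vkMapMemory2KHR(device, &map_info, &data) != VK_SUCCESS)
      return;

   const VkMemoryUnmapInfoKHR unmap_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_UNMAP_INFO_KHR,
      .memory = memory,
   };
   vkUnmapMemory2KHR(device, &unmap_info);
}
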
VkResult anv_FlushMappedMemoryRanges(
@@ -4435,16 +4764,29 @@ VkResult anv_FlushMappedMemoryRanges(
uint32_t memoryRangeCount,
const VkMappedMemoryRange* pMemoryRanges)
{
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
ANV_FROM_HANDLE(anv_device, device, _device);
- if (!device->physical->memory.need_clflush)
+ if (!device->physical->memory.need_flush)
return VK_SUCCESS;
/* Make sure the writes we're flushing have landed. */
__builtin_ia32_mfence();
- clflush_mapped_ranges(device, memoryRangeCount, pMemoryRanges);
+ for (uint32_t i = 0; i < memoryRangeCount; i++) {
+ ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory);
+ if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+ continue;
+
+ uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta;
+ if (map_offset >= mem->map_size)
+ continue;
+ intel_flush_range(mem->map + map_offset,
+ MIN2(pMemoryRanges[i].size,
+ mem->map_size - map_offset));
+ }
+#endif
return VK_SUCCESS;
}
@@ -4453,73 +4795,32 @@ VkResult anv_InvalidateMappedMemoryRanges(
uint32_t memoryRangeCount,
const VkMappedMemoryRange* pMemoryRanges)
{
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
ANV_FROM_HANDLE(anv_device, device, _device);
- if (!device->physical->memory.need_clflush)
+ if (!device->physical->memory.need_flush)
return VK_SUCCESS;
- clflush_mapped_ranges(device, memoryRangeCount, pMemoryRanges);
+ for (uint32_t i = 0; i < memoryRangeCount; i++) {
+ ANV_FROM_HANDLE(anv_device_memory, mem, pMemoryRanges[i].memory);
+ if (mem->type->propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+ continue;
+
+ uint64_t map_offset = pMemoryRanges[i].offset + mem->map_delta;
+ if (map_offset >= mem->map_size)
+ continue;
+
+ intel_invalidate_range(mem->map + map_offset,
+ MIN2(pMemoryRanges[i].size,
+ mem->map_size - map_offset));
+ }
/* Make sure no reads get moved up above the invalidate. */
__builtin_ia32_mfence();
-
+#endif
return VK_SUCCESS;
}
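
A worked example, with illustrative numbers only, of the map_delta bookkeeping that the flush and invalidate paths above now rely on:

/* Illustrative numbers only:
 *
 *   vkMapMemory2KHR(offset = 0x1100)
 *     -> map_offset = 0x1100 & ~4095 = 0x1000   (GEM mapping rounded down)
 *     -> mem->map_delta = 0x1100 - 0x1000 = 0x100
 *
 *   vkFlushMappedMemoryRanges(range.offset = 0x200)
 *     -> flushed at mem->map + 0x200 + 0x100 = mem->map + 0x300
 */
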
-void anv_GetBufferMemoryRequirements2(
- VkDevice _device,
- const VkBufferMemoryRequirementsInfo2* pInfo,
- VkMemoryRequirements2* pMemoryRequirements)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
-
- /* The Vulkan spec (git aaed022) says:
- *
- * memoryTypeBits is a bitfield and contains one bit set for every
- * supported memory type for the resource. The bit `1<<i` is set if and
- * only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
- * structure for the physical device is supported.
- */
- uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1;
-
- /* Base alignment requirement of a cache line */
- uint32_t alignment = 16;
-
- if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT)
- alignment = MAX2(alignment, ANV_UBO_ALIGNMENT);
-
- pMemoryRequirements->memoryRequirements.size = buffer->size;
- pMemoryRequirements->memoryRequirements.alignment = alignment;
-
- /* Storage and Uniform buffers should have their size aligned to
- * 32-bits to avoid boundary checks when last DWord is not complete.
- * This would ensure that not internal padding would be needed for
- * 16-bit types.
- */
- if (device->robust_buffer_access &&
- (buffer->usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT ||
- buffer->usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT))
- pMemoryRequirements->memoryRequirements.size = align_u64(buffer->size, 4);
-
- pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types;
-
- vk_foreach_struct(ext, pMemoryRequirements->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
- VkMemoryDedicatedRequirements *requirements = (void *)ext;
- requirements->prefersDedicatedAllocation = false;
- requirements->requiresDedicatedAllocation = false;
- break;
- }
-
- default:
- anv_debug_ignored_stype(ext->sType);
- break;
- }
- }
-}
-
void anv_GetDeviceMemoryCommitment(
VkDevice device,
VkDeviceMemory memory,
@@ -4529,16 +4830,21 @@ void anv_GetDeviceMemoryCommitment(
}
static void
-anv_bind_buffer_memory(const VkBindBufferMemoryInfo *pBindInfo)
+anv_bind_buffer_memory(struct anv_device *device,
+ const VkBindBufferMemoryInfo *pBindInfo)
{
ANV_FROM_HANDLE(anv_device_memory, mem, pBindInfo->memory);
ANV_FROM_HANDLE(anv_buffer, buffer, pBindInfo->buffer);
assert(pBindInfo->sType == VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO);
+ assert(!anv_buffer_is_sparse(buffer));
+
+ const VkBindMemoryStatusKHR *bind_status =
+ vk_find_struct_const(pBindInfo->pNext, BIND_MEMORY_STATUS_KHR);
if (mem) {
- assert(pBindInfo->memoryOffset < mem->bo->size);
- assert(mem->bo->size - pBindInfo->memoryOffset >= buffer->size);
+ assert(pBindInfo->memoryOffset < mem->vk.size);
+ assert(mem->vk.size - pBindInfo->memoryOffset >= buffer->vk.size);
buffer->address = (struct anv_address) {
.bo = mem->bo,
.offset = pBindInfo->memoryOffset,
@@ -4546,32 +4852,26 @@ anv_bind_buffer_memory(const VkBindBufferMemoryInfo *pBindInfo)
} else {
buffer->address = ANV_NULL_ADDRESS;
}
+
+ ANV_RMV(buffer_bind, device, buffer);
+
+ if (bind_status)
+ *bind_status->pResult = VK_SUCCESS;
}
VkResult anv_BindBufferMemory2(
- VkDevice device,
+ VkDevice _device,
uint32_t bindInfoCount,
const VkBindBufferMemoryInfo* pBindInfos)
{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
for (uint32_t i = 0; i < bindInfoCount; i++)
- anv_bind_buffer_memory(&pBindInfos[i]);
+ anv_bind_buffer_memory(device, &pBindInfos[i]);
return VK_SUCCESS;
}
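
A hypothetical application-side sketch (assuming VK_KHR_maintenance6 and <vulkan/vulkan.h>) of the VkBindMemoryStatusKHR handling added above:

#include <vulkan/vulkan.h>

static VkResult
example_bind_with_status(VkDevice device, VkBuffer buffer, VkDeviceMemory memory)
{
   VkResult bind_status = VK_ERROR_UNKNOWN;
   const VkBindMemoryStatusKHR status = {
      .sType = VK_STRUCTURE_TYPE_BIND_MEMORY_STATUS_KHR,
      .pResult = &bind_status,
   };
   const VkBindBufferMemoryInfo bind = {
      .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
      .pNext = &status,
      .buffer = buffer,
      .memory = memory,
      .memoryOffset = 0,
   };
   VkResult res = vkBindBufferMemory2(device, 1, &bind);
   /* anv writes *pResult = VK_SUCCESS per bind; the call itself can still fail. */
   return res == VK_SUCCESS ? bind_status : res;
}
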
-VkResult anv_QueueBindSparse(
- VkQueue _queue,
- uint32_t bindInfoCount,
- const VkBindSparseInfo* pBindInfo,
- VkFence fence)
-{
- ANV_FROM_HANDLE(anv_queue, queue, _queue);
- if (anv_device_is_lost(queue->device))
- return VK_ERROR_DEVICE_LOST;
-
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
-}
-
// Event functions
VkResult anv_CreateEvent(
@@ -4588,12 +4888,14 @@ VkResult anv_CreateEvent(
event = vk_object_alloc(&device->vk, pAllocator, sizeof(*event),
VK_OBJECT_TYPE_EVENT);
if (event == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
event->state = anv_state_pool_alloc(&device->dynamic_state_pool,
sizeof(uint64_t), 8);
*(uint64_t *)event->state.map = VK_EVENT_RESET;
+ ANV_RMV(event_create, device, event, pCreateInfo->flags, false);
+
*pEvent = anv_event_to_handle(event);
return VK_SUCCESS;
@@ -4610,6 +4912,8 @@ void anv_DestroyEvent(
if (!event)
return;
+ ANV_RMV(resource_destroy, device, event);
+
anv_state_pool_free(&device->dynamic_state_pool, event->state);
vk_object_free(&device->vk, pAllocator, event);
@@ -4622,7 +4926,7 @@ VkResult anv_GetEventStatus(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_event, event, _event);
- if (anv_device_is_lost(device))
+ if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
return *(uint64_t *)event->state.map;
@@ -4652,6 +4956,105 @@ VkResult anv_ResetEvent(
// Buffer functions
+static void
+anv_get_buffer_memory_requirements(struct anv_device *device,
+ VkBufferCreateFlags flags,
+ VkDeviceSize size,
+ VkBufferUsageFlags usage,
+ bool is_sparse,
+ VkMemoryRequirements2* pMemoryRequirements)
+{
+ /* The Vulkan spec (git aaed022) says:
+ *
+ * memoryTypeBits is a bitfield and contains one bit set for every
+ * supported memory type for the resource. The bit `1<<i` is set if and
+ * only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
+ * structure for the physical device is supported.
+ *
+ * We have special memory types for descriptor buffers.
+ */
+ uint32_t memory_types =
+ (flags & VK_BUFFER_CREATE_PROTECTED_BIT) ?
+ device->physical->memory.protected_mem_types :
+ ((usage & (VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT)) ?
+ device->physical->memory.desc_buffer_mem_types :
+ device->physical->memory.default_buffer_mem_types);
+
+ /* The GPU appears to write back to main memory in cachelines. Writes to
+ * one buffer should not clobber writes to another buffer, so make sure
+ * they land in different cachelines.
+ */
+ uint32_t alignment = 64;
+
+ /* From the spec, section "Sparse Buffer and Fully-Resident Image Block
+ * Size":
+ * "The sparse block size in bytes for sparse buffers and fully-resident
+ * images is reported as VkMemoryRequirements::alignment. alignment
+ * represents both the memory alignment requirement and the binding
+ * granularity (in bytes) for sparse resources."
+ */
+ if (is_sparse) {
+ alignment = ANV_SPARSE_BLOCK_SIZE;
+ size = align64(size, alignment);
+ }
+
+ pMemoryRequirements->memoryRequirements.size = size;
+ pMemoryRequirements->memoryRequirements.alignment = alignment;
+
+ /* Storage and Uniform buffers should have their size aligned to
+ * 32 bits to avoid boundary checks when the last DWord is not complete.
+ * This ensures that no internal padding is needed for 16-bit types.
+ */
+ if (device->robust_buffer_access &&
+ (usage & VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT ||
+ usage & VK_BUFFER_USAGE_STORAGE_BUFFER_BIT))
+ pMemoryRequirements->memoryRequirements.size = align64(size, 4);
+
+ pMemoryRequirements->memoryRequirements.memoryTypeBits = memory_types;
+
+ vk_foreach_struct(ext, pMemoryRequirements->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
+ VkMemoryDedicatedRequirements *requirements = (void *)ext;
+ requirements->prefersDedicatedAllocation = false;
+ requirements->requiresDedicatedAllocation = false;
+ break;
+ }
+
+ default:
+ anv_debug_ignored_stype(ext->sType);
+ break;
+ }
+ }
+}
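
A worked example of the sparse branch above, assuming ANV_SPARSE_BLOCK_SIZE is the usual 64 KiB sparse granularity (illustrative only):

/* Illustrative only, assuming ANV_SPARSE_BLOCK_SIZE == 64 * 1024:
 *
 *   requested size     = 100 KiB = 102400 B
 *   reported alignment = 65536 B                           (also the binding granularity)
 *   reported size      = align64(102400, 65536) = 131072 B (two sparse blocks)
 */
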
+
+void anv_GetDeviceBufferMemoryRequirements(
+ VkDevice _device,
+ const VkDeviceBufferMemoryRequirements* pInfo,
+ VkMemoryRequirements2* pMemoryRequirements)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ const bool is_sparse =
+ pInfo->pCreateInfo->flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT;
+
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE) &&
+ pInfo->pCreateInfo->flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
+ VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT |
+ VK_BUFFER_CREATE_SPARSE_ALIASED_BIT))
+ fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
+ __LINE__, pInfo->pCreateInfo->flags);
+
+ anv_get_buffer_memory_requirements(device,
+ pInfo->pCreateInfo->flags,
+ pInfo->pCreateInfo->size,
+ pInfo->pCreateInfo->usage,
+ is_sparse,
+ pMemoryRequirements);
+}
+
VkResult anv_CreateBuffer(
VkDevice _device,
const VkBufferCreateInfo* pCreateInfo,
@@ -4661,25 +5064,62 @@ VkResult anv_CreateBuffer(
ANV_FROM_HANDLE(anv_device, device, _device);
struct anv_buffer *buffer;
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE) &&
+ pCreateInfo->flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
+ VK_BUFFER_CREATE_SPARSE_RESIDENCY_BIT |
+ VK_BUFFER_CREATE_SPARSE_ALIASED_BIT))
+ fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
+ __LINE__, pCreateInfo->flags);
+
/* Don't allow creating buffers bigger than our address space. The real
* issue here is that we may align up the buffer size and we don't want
* doing so to cause roll-over. However, no one has any business
* allocating a buffer larger than our GTT size.
*/
if (pCreateInfo->size > device->physical->gtt_size)
- return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO);
-
- buffer = vk_object_alloc(&device->vk, pAllocator, sizeof(*buffer),
- VK_OBJECT_TYPE_BUFFER);
+ buffer = vk_buffer_create(&device->vk, pCreateInfo,
+ pAllocator, sizeof(*buffer));
if (buffer == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- buffer->create_flags = pCreateInfo->flags;
- buffer->size = pCreateInfo->size;
- buffer->usage = pCreateInfo->usage;
buffer->address = ANV_NULL_ADDRESS;
+ if (anv_buffer_is_sparse(buffer)) {
+ enum anv_bo_alloc_flags alloc_flags = 0;
+ uint64_t client_address = 0;
+
+ if (buffer->vk.create_flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT) {
+ alloc_flags = ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
+ const VkBufferOpaqueCaptureAddressCreateInfo *opaque_addr_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ BUFFER_OPAQUE_CAPTURE_ADDRESS_CREATE_INFO);
+ if (opaque_addr_info)
+ client_address = opaque_addr_info->opaqueCaptureAddress;
+ }
+
+ if (buffer->vk.create_flags & VK_BUFFER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
+ alloc_flags = ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
+
+ const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
+ if (opaque_info)
+ client_address = *((const uint64_t *)opaque_info->opaqueCaptureDescriptorData);
+ }
+
+ VkResult result = anv_init_sparse_bindings(device, buffer->vk.size,
+ &buffer->sparse_data,
+ alloc_flags, client_address,
+ &buffer->address);
+ if (result != VK_SUCCESS) {
+ vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk);
+ return result;
+ }
+ }
+
+ ANV_RMV(buffer_create, device, false, buffer);
*pBuffer = anv_buffer_to_handle(buffer);
@@ -4697,57 +5137,98 @@ void anv_DestroyBuffer(
if (!buffer)
return;
- vk_object_free(&device->vk, pAllocator, buffer);
+ ANV_RMV(buffer_destroy, device, buffer);
+
+ if (anv_buffer_is_sparse(buffer)) {
+ assert(buffer->address.offset == buffer->sparse_data.address);
+ anv_free_sparse_bindings(device, &buffer->sparse_data);
+ }
+
+ vk_buffer_destroy(&device->vk, pAllocator, &buffer->vk);
}
VkDeviceAddress anv_GetBufferDeviceAddress(
VkDevice device,
- const VkBufferDeviceAddressInfoKHR* pInfo)
+ const VkBufferDeviceAddressInfo* pInfo)
{
ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
assert(!anv_address_is_null(buffer->address));
- assert(buffer->address.bo->flags & EXEC_OBJECT_PINNED);
return anv_address_physical(buffer->address);
}
uint64_t anv_GetBufferOpaqueCaptureAddress(
VkDevice device,
- const VkBufferDeviceAddressInfoKHR* pInfo)
+ const VkBufferDeviceAddressInfo* pInfo)
{
- return 0;
+ ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
+
+ return anv_address_physical(buffer->address);
+}
+
+VkResult anv_GetBufferOpaqueCaptureDescriptorDataEXT(
+ VkDevice device,
+ const VkBufferCaptureDescriptorDataInfoEXT* pInfo,
+ void* pData)
+{
+ ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
+
+ *((uint64_t *)pData) = anv_address_physical(buffer->address);
+
+ return VK_SUCCESS;
}
uint64_t anv_GetDeviceMemoryOpaqueCaptureAddress(
VkDevice device,
- const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo)
+ const VkDeviceMemoryOpaqueCaptureAddressInfo* pInfo)
{
ANV_FROM_HANDLE(anv_device_memory, memory, pInfo->memory);
- assert(memory->bo->flags & EXEC_OBJECT_PINNED);
- assert(memory->bo->has_client_visible_address);
+ assert(memory->bo->alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS);
return intel_48b_address(memory->bo->offset);
}
void
-anv_fill_buffer_surface_state(struct anv_device *device, struct anv_state state,
+anv_fill_buffer_surface_state(struct anv_device *device,
+ void *surface_state_ptr,
enum isl_format format,
+ struct isl_swizzle swizzle,
isl_surf_usage_flags_t usage,
struct anv_address address,
uint32_t range, uint32_t stride)
{
- isl_buffer_fill_state(&device->isl_dev, state.map,
+ isl_buffer_fill_state(&device->isl_dev, surface_state_ptr,
.address = anv_address_physical(address),
.mocs = isl_mocs(&device->isl_dev, usage,
- address.bo && address.bo->is_external),
+ address.bo && anv_bo_is_external(address.bo)),
.size_B = range,
.format = format,
- .swizzle = ISL_SWIZZLE_IDENTITY,
+ .swizzle = swizzle,
.stride_B = stride);
}
+VkResult anv_GetSamplerOpaqueCaptureDescriptorDataEXT(
+ VkDevice _device,
+ const VkSamplerCaptureDescriptorDataInfoEXT* pInfo,
+ void* pData)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_sampler, sampler, pInfo->sampler);
+
+ if (sampler->custom_border_color_db.alloc_size != 0) {
+ *((uint32_t *)pData) =
+ anv_state_reserved_array_pool_state_index(
+ &device->custom_border_colors_db,
+ sampler->custom_border_color_db);
+ } else {
+ *((uint32_t *)pData) = 0;
+ }
+
+ return VK_SUCCESS;
+}
+
void anv_DestroySampler(
VkDevice _device,
VkSampler _sampler,
@@ -4768,85 +5249,32 @@ void anv_DestroySampler(
anv_state_reserved_pool_free(&device->custom_border_colors,
sampler->custom_border_color);
}
-
- vk_object_free(&device->vk, pAllocator, sampler);
-}
-
-VkResult anv_CreateFramebuffer(
- VkDevice _device,
- const VkFramebufferCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkFramebuffer* pFramebuffer)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_framebuffer *framebuffer;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO);
-
- size_t size = sizeof(*framebuffer);
-
- /* VK_KHR_imageless_framebuffer extension says:
- *
- * If flags includes VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR,
- * parameter pAttachments is ignored.
- */
- if (!(pCreateInfo->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR))
- size += sizeof(struct anv_image_view *) * pCreateInfo->attachmentCount;
-
- framebuffer = vk_object_alloc(&device->vk, pAllocator, size,
- VK_OBJECT_TYPE_FRAMEBUFFER);
- if (framebuffer == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- framebuffer->width = pCreateInfo->width;
- framebuffer->height = pCreateInfo->height;
- framebuffer->layers = pCreateInfo->layers;
-
- if (!(pCreateInfo->flags & VK_FRAMEBUFFER_CREATE_IMAGELESS_BIT_KHR)) {
- for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
- ANV_FROM_HANDLE(anv_image_view, iview, pCreateInfo->pAttachments[i]);
- framebuffer->attachments[i] = iview;
- }
- framebuffer->attachment_count = pCreateInfo->attachmentCount;
+ if (sampler->custom_border_color_db.map) {
+ anv_state_reserved_array_pool_free(&device->custom_border_colors_db,
+ sampler->custom_border_color_db);
}
- *pFramebuffer = anv_framebuffer_to_handle(framebuffer);
-
- return VK_SUCCESS;
-}
-
-void anv_DestroyFramebuffer(
- VkDevice _device,
- VkFramebuffer _fb,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_framebuffer, fb, _fb);
-
- if (!fb)
- return;
-
- vk_object_free(&device->vk, pAllocator, fb);
+ vk_sampler_destroy(&device->vk, pAllocator, &sampler->vk);
}
-static const VkTimeDomainEXT anv_time_domains[] = {
- VK_TIME_DOMAIN_DEVICE_EXT,
- VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
+static const VkTimeDomainKHR anv_time_domains[] = {
+ VK_TIME_DOMAIN_DEVICE_KHR,
+ VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR,
#ifdef CLOCK_MONOTONIC_RAW
- VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT,
+ VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR,
#endif
};
-VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
+VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsKHR(
VkPhysicalDevice physicalDevice,
uint32_t *pTimeDomainCount,
- VkTimeDomainEXT *pTimeDomains)
+ VkTimeDomainKHR *pTimeDomains)
{
int d;
- VK_OUTARRAY_MAKE(out, pTimeDomains, pTimeDomainCount);
+ VK_OUTARRAY_MAKE_TYPED(VkTimeDomainKHR, out, pTimeDomains, pTimeDomainCount);
for (d = 0; d < ARRAY_SIZE(anv_time_domains); d++) {
- vk_outarray_append(&out, i) {
+ vk_outarray_append_typed(VkTimeDomainKHR, &out, i) {
*i = anv_time_domains[d];
}
}
@@ -4854,63 +5282,146 @@ VkResult anv_GetPhysicalDeviceCalibrateableTimeDomainsEXT(
return vk_outarray_status(&out);
}
-static uint64_t
-anv_clock_gettime(clockid_t clock_id)
+static inline clockid_t
+anv_get_default_cpu_clock_id(void)
{
- struct timespec current;
- int ret;
+#ifdef CLOCK_MONOTONIC_RAW
+ return CLOCK_MONOTONIC_RAW;
+#else
+ return CLOCK_MONOTONIC;
+#endif
+}
- ret = clock_gettime(clock_id, &current);
+static inline clockid_t
+vk_time_domain_to_clockid(VkTimeDomainKHR domain)
+{
+ switch (domain) {
#ifdef CLOCK_MONOTONIC_RAW
- if (ret < 0 && clock_id == CLOCK_MONOTONIC_RAW)
- ret = clock_gettime(CLOCK_MONOTONIC, &current);
+ case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
+ return CLOCK_MONOTONIC_RAW;
#endif
- if (ret < 0)
- return 0;
+ case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
+ return CLOCK_MONOTONIC;
+ default:
+ unreachable("Missing");
+ return CLOCK_MONOTONIC;
+ }
+}
- return (uint64_t) current.tv_sec * 1000000000ULL + current.tv_nsec;
+static inline bool
+is_cpu_time_domain(VkTimeDomainKHR domain)
+{
+ return domain == VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR ||
+ domain == VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR;
}
-VkResult anv_GetCalibratedTimestampsEXT(
+static inline bool
+is_gpu_time_domain(VkTimeDomainKHR domain)
+{
+ return domain == VK_TIME_DOMAIN_DEVICE_KHR;
+}
+
+VkResult anv_GetCalibratedTimestampsKHR(
VkDevice _device,
uint32_t timestampCount,
- const VkCalibratedTimestampInfoEXT *pTimestampInfos,
+ const VkCalibratedTimestampInfoKHR *pTimestampInfos,
uint64_t *pTimestamps,
uint64_t *pMaxDeviation)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- uint64_t timestamp_frequency = device->info.timestamp_frequency;
- int ret;
- int d;
+ const uint64_t timestamp_frequency = device->info->timestamp_frequency;
+ const uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
+ uint32_t d, increment;
uint64_t begin, end;
uint64_t max_clock_period = 0;
+ const enum intel_kmd_type kmd_type = device->physical->info.kmd_type;
+ const bool has_correlate_timestamp = kmd_type == INTEL_KMD_TYPE_XE;
+ clockid_t cpu_clock_id = -1;
+
+ begin = end = vk_clock_gettime(anv_get_default_cpu_clock_id());
+
+ for (d = 0, increment = 1; d < timestampCount; d += increment) {
+ const VkTimeDomainKHR current = pTimestampInfos[d].timeDomain;
+ /* If we have a request pattern like this:
+ * - domain0 = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR or VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR
+ * - domain1 = VK_TIME_DOMAIN_DEVICE_KHR
+ * - domain2 = domain0 (optional)
+ *
+ * We can combine all of those into a single ioctl for maximum accuracy.
+ */
+ if (has_correlate_timestamp && (d + 1) < timestampCount) {
+ const VkTimeDomainKHR next = pTimestampInfos[d + 1].timeDomain;
+
+ if ((is_cpu_time_domain(current) && is_gpu_time_domain(next)) ||
+ (is_gpu_time_domain(current) && is_cpu_time_domain(next))) {
+ /* We'll consume at least 2 elements. */
+ increment = 2;
+
+ if (is_cpu_time_domain(current))
+ cpu_clock_id = vk_time_domain_to_clockid(current);
+ else
+ cpu_clock_id = vk_time_domain_to_clockid(next);
+
+ uint64_t cpu_timestamp, gpu_timestamp, cpu_delta_timestamp, cpu_end_timestamp;
+ if (!intel_gem_read_correlate_cpu_gpu_timestamp(device->fd,
+ kmd_type,
+ INTEL_ENGINE_CLASS_RENDER,
+ 0 /* engine_instance */,
+ cpu_clock_id,
+ &cpu_timestamp,
+ &gpu_timestamp,
+ &cpu_delta_timestamp))
+ return vk_device_set_lost(&device->vk, "Failed to read correlate timestamp %m");
+
+ cpu_end_timestamp = cpu_timestamp + cpu_delta_timestamp;
+ if (is_cpu_time_domain(current)) {
+ pTimestamps[d] = cpu_timestamp;
+ pTimestamps[d + 1] = gpu_timestamp;
+ } else {
+ pTimestamps[d] = gpu_timestamp;
+ pTimestamps[d + 1] = cpu_end_timestamp;
+ }
+ max_clock_period = MAX2(max_clock_period, device_period);
+
+ /* If we can consume a third element */
+ if ((d + 2) < timestampCount &&
+ is_cpu_time_domain(current) &&
+ current == pTimestampInfos[d + 2].timeDomain) {
+ pTimestamps[d + 2] = cpu_end_timestamp;
+ increment++;
+ }
+
+ /* If we're the first element, we can replace begin */
+ if (d == 0 && cpu_clock_id == anv_get_default_cpu_clock_id())
+ begin = cpu_timestamp;
+
+ /* If we're in the same clock domain as begin/end, we can set the end. */
+ if (cpu_clock_id == anv_get_default_cpu_clock_id())
+ end = cpu_end_timestamp;
-#ifdef CLOCK_MONOTONIC_RAW
- begin = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
-#else
- begin = anv_clock_gettime(CLOCK_MONOTONIC);
-#endif
-
- for (d = 0; d < timestampCount; d++) {
- switch (pTimestampInfos[d].timeDomain) {
- case VK_TIME_DOMAIN_DEVICE_EXT:
- ret = anv_gem_reg_read(device->fd, TIMESTAMP | I915_REG_READ_8B_WA,
- &pTimestamps[d]);
+ continue;
+ }
+ }
- if (ret != 0) {
- return anv_device_set_lost(device, "Failed to read the TIMESTAMP "
- "register: %m");
+ /* Fall back to the regular method */
+ increment = 1;
+ switch (current) {
+ case VK_TIME_DOMAIN_DEVICE_KHR:
+ if (!intel_gem_read_render_timestamp(device->fd,
+ device->info->kmd_type,
+ &pTimestamps[d])) {
+ return vk_device_set_lost(&device->vk, "Failed to read the "
+ "TIMESTAMP register: %m");
}
- uint64_t device_period = DIV_ROUND_UP(1000000000, timestamp_frequency);
max_clock_period = MAX2(max_clock_period, device_period);
break;
- case VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT:
- pTimestamps[d] = anv_clock_gettime(CLOCK_MONOTONIC);
+ case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
+ pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
max_clock_period = MAX2(max_clock_period, 1);
break;
#ifdef CLOCK_MONOTONIC_RAW
- case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_EXT:
+ case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
pTimestamps[d] = begin;
break;
#endif
@@ -4920,50 +5431,13 @@ VkResult anv_GetCalibratedTimestampsEXT(
}
}
-#ifdef CLOCK_MONOTONIC_RAW
- end = anv_clock_gettime(CLOCK_MONOTONIC_RAW);
-#else
- end = anv_clock_gettime(CLOCK_MONOTONIC);
-#endif
+ /* If the last timestamp was not read with the correlate-timestamp method, or
+ * if it was but the last CPU clock is not the default one, read the time again.
+ */
+ if (increment == 1 || cpu_clock_id != anv_get_default_cpu_clock_id())
+ end = vk_clock_gettime(anv_get_default_cpu_clock_id());
- /*
- * The maximum deviation is the sum of the interval over which we
- * perform the sampling and the maximum period of any sampled
- * clock. That's because the maximum skew between any two sampled
- * clock edges is when the sampled clock with the largest period is
- * sampled at the end of that period but right at the beginning of the
- * sampling interval and some other clock is sampled right at the
- * begining of its sampling period and right at the end of the
- * sampling interval. Let's assume the GPU has the longest clock
- * period and that the application is sampling GPU and monotonic:
- *
- * s e
- * w x y z 0 1 2 3 4 5 6 7 8 9 a b c d e f
- * Raw -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
- *
- * g
- * 0 1 2 3
- * GPU -----_____-----_____-----_____-----_____
- *
- * m
- * x y z 0 1 2 3 4 5 6 7 8 9 a b c
- * Monotonic -_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-
- *
- * Interval <----------------->
- * Deviation <-------------------------->
- *
- * s = read(raw) 2
- * g = read(GPU) 1
- * m = read(monotonic) 2
- * e = read(raw) b
- *
- * We round the sample interval up by one tick to cover sampling error
- * in the interval clock
- */
-
- uint64_t sample_interval = end - begin + 1;
-
- *pMaxDeviation = sample_interval + max_clock_period;
+ *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);
return VK_SUCCESS;
}
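
A hypothetical application-side request pattern (assuming VK_KHR_calibrated_timestamps and <vulkan/vulkan.h>) that the combined-ioctl path above is optimized for:

#include <vulkan/vulkan.h>

static void
example_calibrated_timestamps(VkDevice device)
{
   const VkCalibratedTimestampInfoKHR infos[3] = {
      { .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
        .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR },
      { .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
        .timeDomain = VK_TIME_DOMAIN_DEVICE_KHR },
      { .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_KHR,
        .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR },
   };
   uint64_t timestamps[3], max_deviation;

   /* On Xe kernels, all three entries are served from a single correlate ioctl. */
   vkGetCalibratedTimestampsKHR(device, 3, infos, timestamps, &max_deviation);
}
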
@@ -4992,86 +5466,198 @@ void anv_GetPhysicalDeviceMultisamplePropertiesEXT(
anv_debug_ignored_stype(ext->sType);
}
-/* vk_icd.h does not declare this function, so we declare it here to
- * suppress Wmissing-prototypes.
- */
-PUBLIC VKAPI_ATTR VkResult VKAPI_CALL
-vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion);
-
-PUBLIC VKAPI_ATTR VkResult VKAPI_CALL
-vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion)
-{
- /* For the full details on loader interface versioning, see
- * <https://github.com/KhronosGroup/Vulkan-LoaderAndValidationLayers/blob/master/loader/LoaderAndLayerInterface.md>.
- * What follows is a condensed summary, to help you navigate the large and
- * confusing official doc.
- *
- * - Loader interface v0 is incompatible with later versions. We don't
- * support it.
- *
- * - In loader interface v1:
- * - The first ICD entrypoint called by the loader is
- * vk_icdGetInstanceProcAddr(). The ICD must statically expose this
- * entrypoint.
- * - The ICD must statically expose no other Vulkan symbol unless it is
- * linked with -Bsymbolic.
- * - Each dispatchable Vulkan handle created by the ICD must be
- * a pointer to a struct whose first member is VK_LOADER_DATA. The
- * ICD must initialize VK_LOADER_DATA.loadMagic to ICD_LOADER_MAGIC.
- * - The loader implements vkCreate{PLATFORM}SurfaceKHR() and
- * vkDestroySurfaceKHR(). The ICD must be capable of working with
- * such loader-managed surfaces.
- *
- * - Loader interface v2 differs from v1 in:
- * - The first ICD entrypoint called by the loader is
- * vk_icdNegotiateLoaderICDInterfaceVersion(). The ICD must
- * statically expose this entrypoint.
- *
- * - Loader interface v3 differs from v2 in:
- * - The ICD must implement vkCreate{PLATFORM}SurfaceKHR(),
- * vkDestroySurfaceKHR(), and other API which uses VKSurfaceKHR,
- * because the loader no longer does so.
- *
- * - Loader interface v4 differs from v3 in:
- * - The ICD must implement vk_icdGetPhysicalDeviceProcAddr().
- */
- *pSupportedVersion = MIN2(*pSupportedVersion, 4u);
- return VK_SUCCESS;
-}
-
VkResult anv_GetPhysicalDeviceFragmentShadingRatesKHR(
VkPhysicalDevice physicalDevice,
uint32_t* pFragmentShadingRateCount,
VkPhysicalDeviceFragmentShadingRateKHR* pFragmentShadingRates)
{
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
- VK_OUTARRAY_MAKE(out, pFragmentShadingRates, pFragmentShadingRateCount);
-
-#define append_rate(_samples, _width, _height) \
- do { \
- vk_outarray_append(&out, __r) { \
- __r->sampleCounts = _samples; \
- __r->fragmentSize = (VkExtent2D) { \
- .width = _width, \
- .height = _height, \
- }; \
- } \
+ VK_OUTARRAY_MAKE_TYPED(VkPhysicalDeviceFragmentShadingRateKHR, out,
+ pFragmentShadingRates, pFragmentShadingRateCount);
+
+#define append_rate(_samples, _width, _height) \
+ do { \
+ vk_outarray_append_typed(VkPhysicalDeviceFragmentShadingRateKHR, &out, __r) { \
+ __r->sampleCounts = _samples; \
+ __r->fragmentSize = (VkExtent2D) { \
+ .width = _width, \
+ .height = _height, \
+ }; \
+ } \
} while (0)
VkSampleCountFlags sample_counts =
isl_device_get_sample_counts(&physical_device->isl_dev);
+ /* BSpec 47003: There are a number of restrictions on the sample count
+ * based on the coarse pixel size.
+ */
+ static const VkSampleCountFlags cp_size_sample_limits[] = {
+ [1] = ISL_SAMPLE_COUNT_16_BIT | ISL_SAMPLE_COUNT_8_BIT |
+ ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
+ [2] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
+ [4] = ISL_SAMPLE_COUNT_4_BIT | ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
+ [8] = ISL_SAMPLE_COUNT_2_BIT | ISL_SAMPLE_COUNT_1_BIT,
+ [16] = ISL_SAMPLE_COUNT_1_BIT,
+ };
+
for (uint32_t x = 4; x >= 1; x /= 2) {
for (uint32_t y = 4; y >= 1; y /= 2) {
- /* For size {1, 1}, the sample count must be ~0 */
- if (x == 1 && y == 1)
- append_rate(~0, x, y);
- else
- append_rate(sample_counts, x, y);
- }
+ if (physical_device->info.has_coarse_pixel_primitive_and_cb) {
+ /* BSpec 47003:
+ * "CPsize 1x4 and 4x1 are not supported"
+ */
+ if ((x == 1 && y == 4) || (x == 4 && y == 1))
+ continue;
+
+ /* For size {1, 1}, the sample count must be ~0
+ *
+ * 4x2 is also a special case.
+ */
+ if (x == 1 && y == 1)
+ append_rate(~0, x, y);
+ else if (x == 4 && y == 2)
+ append_rate(ISL_SAMPLE_COUNT_1_BIT, x, y);
+ else
+ append_rate(cp_size_sample_limits[x * y], x, y);
+ } else {
+ /* For size {1, 1}, the sample count must be ~0 */
+ if (x == 1 && y == 1)
+ append_rate(~0, x, y);
+ else
+ append_rate(sample_counts, x, y);
+ }
+ }
}
#undef append_rate
return vk_outarray_status(&out);
}
+
+const struct intel_device_info_pat_entry *
+anv_device_get_pat_entry(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
+{
+ if (alloc_flags & ANV_BO_ALLOC_IMPORTED)
+ return &device->info->pat.cached_coherent;
+
+ /* PAT indices have no actual effect on DG2 and DG1: smem caches will always
+ * be snooped by the GPU and lmem will always be WC.
+ * This might change on future discrete platforms.
+ */
+ if (anv_physical_device_has_vram(device->physical)) {
+ if (alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM)
+ return &device->info->pat.cached_coherent;
+ return &device->info->pat.writecombining;
+ }
+
+ if ((alloc_flags & (ANV_BO_ALLOC_HOST_CACHED_COHERENT)) == ANV_BO_ALLOC_HOST_CACHED_COHERENT)
+ return &device->info->pat.cached_coherent;
+ else if (alloc_flags & (ANV_BO_ALLOC_EXTERNAL | ANV_BO_ALLOC_SCANOUT))
+ return &device->info->pat.scanout;
+ else if (alloc_flags & ANV_BO_ALLOC_HOST_CACHED)
+ return &device->info->pat.writeback_incoherent;
+ else
+ return &device->info->pat.writecombining;
+}
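
For readability, the selection above for integrated GPUs (no vram) collapses to the decision table below; this only restates the branches in order, assuming ANV_BO_ALLOC_HOST_CACHED_COHERENT is the combination of the CACHED and COHERENT bits.

/* Integrated GPUs (no vram), checked in this order:
 *   IMPORTED                                 -> pat.cached_coherent
 *   HOST_CACHED and HOST_COHERENT (both set) -> pat.cached_coherent
 *   EXTERNAL or SCANOUT                      -> pat.scanout
 *   HOST_CACHED only                         -> pat.writeback_incoherent
 *   otherwise                                -> pat.writecombining
 */
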
+
+static VkComponentTypeKHR
+convert_component_type(enum intel_cooperative_matrix_component_type t)
+{
+ switch (t) {
+ case INTEL_CMAT_FLOAT16: return VK_COMPONENT_TYPE_FLOAT16_KHR;
+ case INTEL_CMAT_FLOAT32: return VK_COMPONENT_TYPE_FLOAT32_KHR;
+ case INTEL_CMAT_SINT32: return VK_COMPONENT_TYPE_SINT32_KHR;
+ case INTEL_CMAT_SINT8: return VK_COMPONENT_TYPE_SINT8_KHR;
+ case INTEL_CMAT_UINT32: return VK_COMPONENT_TYPE_UINT32_KHR;
+ case INTEL_CMAT_UINT8: return VK_COMPONENT_TYPE_UINT8_KHR;
+ }
+ unreachable("invalid cooperative matrix component type in configuration");
+}
+
+static VkScopeKHR
+convert_scope(enum intel_cmat_scope scope)
+{
+ switch (scope) {
+ case INTEL_CMAT_SCOPE_SUBGROUP: return VK_SCOPE_SUBGROUP_KHR;
+ default:
+ unreachable("invalid cooperative matrix scope in configuration");
+ }
+}
+
+VkResult anv_GetPhysicalDeviceCooperativeMatrixPropertiesKHR(
+ VkPhysicalDevice physicalDevice,
+ uint32_t* pPropertyCount,
+ VkCooperativeMatrixPropertiesKHR* pProperties)
+{
+ ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+ const struct intel_device_info *devinfo = &pdevice->info;
+
+ assert(anv_has_cooperative_matrix(pdevice));
+
+ VK_OUTARRAY_MAKE_TYPED(VkCooperativeMatrixPropertiesKHR, out, pProperties, pPropertyCount);
+
+ for (int i = 0; i < ARRAY_SIZE(devinfo->cooperative_matrix_configurations); i++) {
+ const struct intel_cooperative_matrix_configuration *cfg =
+ &devinfo->cooperative_matrix_configurations[i];
+
+ if (cfg->scope == INTEL_CMAT_SCOPE_NONE)
+ break;
+
+ vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop) {
+ prop->sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
+
+ prop->MSize = cfg->m;
+ prop->NSize = cfg->n;
+ prop->KSize = cfg->k;
+
+ prop->AType = convert_component_type(cfg->a);
+ prop->BType = convert_component_type(cfg->b);
+ prop->CType = convert_component_type(cfg->c);
+ prop->ResultType = convert_component_type(cfg->result);
+
+ prop->saturatingAccumulation = VK_FALSE;
+ prop->scope = convert_scope(cfg->scope);
+ }
+
+ /* VUID-RuntimeSpirv-saturatingAccumulation-08983 says:
+ *
+ * For OpCooperativeMatrixMulAddKHR, the SaturatingAccumulation
+ * cooperative matrix operand must be present if and only if
+ * VkCooperativeMatrixPropertiesKHR::saturatingAccumulation is
+ * VK_TRUE.
+ *
+ * As a result, we have to advertise integer configs both with and
+ * without this flag set.
+ *
+ * The DPAS instruction does not support the .sat modifier, so only
+ * advertise the configurations when the DPAS would be lowered.
+ *
+ * FINISHME: It should be possible to do better than full lowering on
+ * platforms that support DPAS. Emit a DPAS with a NULL accumulator
+ * argument, then perform the correct sequence of saturating add
+ * instructions.
+ */
+ if (cfg->a != INTEL_CMAT_FLOAT16 &&
+ (devinfo->verx10 < 125 || debug_get_bool_option("INTEL_LOWER_DPAS", false))) {
+ vk_outarray_append_typed(VkCooperativeMatrixPropertiesKHR, &out, prop) {
+ prop->sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
+
+ prop->MSize = cfg->m;
+ prop->NSize = cfg->n;
+ prop->KSize = cfg->k;
+
+ prop->AType = convert_component_type(cfg->a);
+ prop->BType = convert_component_type(cfg->b);
+ prop->CType = convert_component_type(cfg->c);
+ prop->ResultType = convert_component_type(cfg->result);
+
+ prop->saturatingAccumulation = VK_TRUE;
+ prop->scope = convert_scope(cfg->scope);
+ }
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
diff --git a/src/intel/vulkan/anv_formats.c b/src/intel/vulkan/anv_formats.c
index 1713b446825..91cd9a9e7a9 100644
--- a/src/intel/vulkan/anv_formats.c
+++ b/src/intel/vulkan/anv_formats.c
@@ -23,6 +23,8 @@
#include "anv_private.h"
#include "drm-uapi/drm_fourcc.h"
+#include "vk_android.h"
+#include "vk_enum_defines.h"
#include "vk_enum_to_str.h"
#include "vk_format.h"
#include "vk_util.h"
@@ -50,7 +52,6 @@
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
{ .isl_format = __hw_fmt, .swizzle = __swizzle, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_COLOR_BIT, \
}, \
}, \
@@ -65,7 +66,6 @@
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
{ .isl_format = __hw_fmt, .swizzle = RGBA, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_DEPTH_BIT, \
}, \
}, \
@@ -77,7 +77,6 @@
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
{ .isl_format = __hw_fmt, .swizzle = RGBA, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_STENCIL_BIT, \
}, \
}, \
@@ -89,11 +88,9 @@
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
{ .isl_format = __fmt1, .swizzle = RGBA, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_DEPTH_BIT, \
}, \
{ .isl_format = __fmt2, .swizzle = RGBA, \
- .denominator_scales = { 1, 1, }, \
.aspect = VK_IMAGE_ASPECT_STENCIL_BIT, \
}, \
}, \
@@ -109,32 +106,21 @@
.vk_format = VK_FORMAT_UNDEFINED, \
}
-#define y_plane(__plane, __hw_fmt, __swizzle, __ycbcr_swizzle, dhs, dvs) \
+#define ycbcr_plane(__plane, __hw_fmt, __swizzle) \
{ .isl_format = __hw_fmt, \
.swizzle = __swizzle, \
- .ycbcr_swizzle = __ycbcr_swizzle, \
- .denominator_scales = { dhs, dvs, }, \
- .has_chroma = false, \
- .aspect = VK_IMAGE_ASPECT_PLANE_0_BIT, /* Y plane is always plane 0 */ \
- }
-
-#define chroma_plane(__plane, __hw_fmt, __swizzle, __ycbcr_swizzle, dhs, dvs) \
- { .isl_format = __hw_fmt, \
- .swizzle = __swizzle, \
- .ycbcr_swizzle = __ycbcr_swizzle, \
- .denominator_scales = { dhs, dvs, }, \
- .has_chroma = true, \
.aspect = VK_IMAGE_ASPECT_PLANE_ ## __plane ## _BIT, \
}
-#define ycbcr_fmt(__vk_fmt, __n_planes, ...) \
+#define ycbcr_fmt(__vk_fmt, __n_planes, __can_ycbcr, __can_video, ...) \
[VK_ENUM_OFFSET(__vk_fmt)] = { \
.planes = { \
__VA_ARGS__, \
}, \
.vk_format = __vk_fmt, \
.n_planes = __n_planes, \
- .can_ycbcr = true, \
+ .can_ycbcr = __can_ycbcr, \
+ .can_video = __can_video, \
}
/* HINT: For array formats, the ISL name should match the VK name. For
@@ -148,9 +134,9 @@ static const struct anv_format main_formats[] = {
fmt1(VK_FORMAT_R4G4B4A4_UNORM_PACK16, ISL_FORMAT_A4B4G4R4_UNORM),
swiz_fmt1(VK_FORMAT_B4G4R4A4_UNORM_PACK16, ISL_FORMAT_A4B4G4R4_UNORM, BGRA),
fmt1(VK_FORMAT_R5G6B5_UNORM_PACK16, ISL_FORMAT_B5G6R5_UNORM),
- fmt_unsupported(VK_FORMAT_B5G6R5_UNORM_PACK16),
+ swiz_fmt1(VK_FORMAT_B5G6R5_UNORM_PACK16, ISL_FORMAT_B5G6R5_UNORM, BGRA),
fmt1(VK_FORMAT_R5G5B5A1_UNORM_PACK16, ISL_FORMAT_A1B5G5R5_UNORM),
- fmt_unsupported(VK_FORMAT_B5G5R5A1_UNORM_PACK16),
+ swiz_fmt1(VK_FORMAT_B5G5R5A1_UNORM_PACK16, ISL_FORMAT_A1B5G5R5_UNORM, BGRA),
fmt1(VK_FORMAT_A1R5G5B5_UNORM_PACK16, ISL_FORMAT_B5G5R5A1_UNORM),
fmt1(VK_FORMAT_R8_UNORM, ISL_FORMAT_R8_UNORM),
fmt1(VK_FORMAT_R8_SNORM, ISL_FORMAT_R8_SNORM),
@@ -334,33 +320,33 @@ static const struct anv_format main_formats[] = {
};
static const struct anv_format _4444_formats[] = {
- fmt1(VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT, ISL_FORMAT_B4G4R4A4_UNORM),
- fmt_unsupported(VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT),
+ fmt1(VK_FORMAT_A4R4G4B4_UNORM_PACK16, ISL_FORMAT_B4G4R4A4_UNORM),
+ fmt_unsupported(VK_FORMAT_A4B4G4R4_UNORM_PACK16),
};
static const struct anv_format ycbcr_formats[] = {
- ycbcr_fmt(VK_FORMAT_G8B8G8R8_422_UNORM, 1,
- y_plane(0, ISL_FORMAT_YCRCB_SWAPUV, RGBA, _ISL_SWIZZLE(BLUE, GREEN, RED, ZERO), 1, 1)),
- ycbcr_fmt(VK_FORMAT_B8G8R8G8_422_UNORM, 1,
- y_plane(0, ISL_FORMAT_YCRCB_SWAPUVY, RGBA, _ISL_SWIZZLE(BLUE, GREEN, RED, ZERO), 1, 1)),
- ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, 3,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 2),
- chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 2)),
- ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 2,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 2)),
- ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, 3,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 1),
- chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 1)),
- ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, 2,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 1)),
- ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, 3,
- y_plane(0, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(2, ISL_FORMAT_R8_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 1, 1)),
+ ycbcr_fmt(VK_FORMAT_G8B8G8R8_422_UNORM, 1, true, false,
+ ycbcr_plane(0, ISL_FORMAT_YCRCB_NORMAL, RGBA)),
+ ycbcr_fmt(VK_FORMAT_B8G8R8G8_422_UNORM, 1, true, false,
+ ycbcr_plane(0, ISL_FORMAT_YCRCB_SWAPY, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R8_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_420_UNORM, 2, true, true,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_422_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R8_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8R8_2PLANE_422_UNORM, 2, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8G8_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G8_B8_R8_3PLANE_444_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R8_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R8_UNORM, RGBA)),
fmt_unsupported(VK_FORMAT_R10X6_UNORM_PACK16),
fmt_unsupported(VK_FORMAT_R10X6G10X6_UNORM_2PACK16),
@@ -368,7 +354,9 @@ static const struct anv_format ycbcr_formats[] = {
fmt_unsupported(VK_FORMAT_G10X6B10X6G10X6R10X6_422_UNORM_4PACK16),
fmt_unsupported(VK_FORMAT_B10X6G10X6R10X6G10X6_422_UNORM_4PACK16),
fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_420_UNORM_3PACK16),
- fmt_unsupported(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16),
+ ycbcr_fmt(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16, 2, false, true,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA)),
fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_422_UNORM_3PACK16),
fmt_unsupported(VK_FORMAT_G10X6_B10X6R10X6_2PLANE_422_UNORM_3PACK16),
fmt_unsupported(VK_FORMAT_G10X6_B10X6_R10X6_3PLANE_444_UNORM_3PACK16),
@@ -388,24 +376,29 @@ static const struct anv_format ycbcr_formats[] = {
fmt_unsupported(VK_FORMAT_G16B16G16R16_422_UNORM),
fmt_unsupported(VK_FORMAT_B16G16R16G16_422_UNORM),
- ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, 3,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 2),
- chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 2)),
- ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, 2,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 2)),
- ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, 3,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 2, 1),
- chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 2, 1)),
- ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, 2,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, RED, ZERO, ZERO), 2, 1)),
- ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, 3,
- y_plane(0, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(GREEN, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(1, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(BLUE, ZERO, ZERO, ZERO), 1, 1),
- chroma_plane(2, ISL_FORMAT_R16_UNORM, RGBA, _ISL_SWIZZLE(RED, ZERO, ZERO, ZERO), 1, 1)),
+ ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_420_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R16_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_420_UNORM, 2, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_422_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R16_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G16_B16R16_2PLANE_422_UNORM, 2, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16G16_UNORM, RGBA)),
+ ycbcr_fmt(VK_FORMAT_G16_B16_R16_3PLANE_444_UNORM, 3, true, false,
+ ycbcr_plane(0, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(1, ISL_FORMAT_R16_UNORM, RGBA),
+ ycbcr_plane(2, ISL_FORMAT_R16_UNORM, RGBA)),
+};
+
+static const struct anv_format maintenance5_formats[] = {
+ fmt1(VK_FORMAT_A8_UNORM_KHR, ISL_FORMAT_A8_UNORM),
+ swiz_fmt1(VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR, ISL_FORMAT_B5G5R5A1_UNORM, BGRA)
};
#undef _fmt
@@ -423,6 +416,8 @@ static const struct {
.n_formats = ARRAY_SIZE(_4444_formats), },
[_VK_KHR_sampler_ycbcr_conversion_number] = { .formats = ycbcr_formats,
.n_formats = ARRAY_SIZE(ycbcr_formats), },
+ [_VK_KHR_maintenance5_number] = { .formats = maintenance5_formats,
+ .n_formats = ARRAY_SIZE(maintenance5_formats), },
};
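/* Rough sketch of how these per-extension sub-tables are consumed (helper
 * names approximate, not taken from this patch): the VkFormat is split into
 * an extension number and an enum offset, which index the outer table and
 * the matching formats[] array respectively, e.g.:
 *
 *    uint32_t ext_number  = VK_ENUM_EXTENSION(vk_format);
 *    uint32_t enum_offset = VK_ENUM_OFFSET(vk_format);
 *
 *    if (ext_number >= ARRAY_SIZE(anv_formats) ||
 *        enum_offset >= anv_formats[ext_number].n_formats)
 *       return NULL;
 *
 *    return &anv_formats[ext_number].formats[enum_offset];
 *
 * so VK_FORMAT_A8_UNORM_KHR resolves through the new
 * _VK_KHR_maintenance5_number entry added above.
 */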
const struct anv_format *
@@ -494,14 +489,6 @@ anv_get_format_plane(const struct intel_device_info *devinfo,
const struct isl_format_layout *isl_layout =
isl_format_get_layout(plane_format.isl_format);
- /* On Ivy Bridge we don't even have enough 24 and 48-bit formats that we
- * can reliably do texture upload with BLORP so just don't claim support
- * for any of them.
- */
- if (devinfo->verx10 == 70 &&
- (isl_layout->bpb == 24 || isl_layout->bpb == 48))
- return unsupported;
-
if (tiling == VK_IMAGE_TILING_OPTIMAL &&
!util_is_power_of_two_or_zero(isl_layout->bpb)) {
      /* Tiled formats *must* be power-of-two because we need to upload
@@ -520,14 +507,6 @@ anv_get_format_plane(const struct intel_device_info *devinfo,
}
}
- /* The B4G4R4A4 format isn't available prior to Broadwell so we have to fall
- * back to a format with a more complex swizzle.
- */
- if (vk_format == VK_FORMAT_B4G4R4A4_UNORM_PACK16 && devinfo->ver < 8) {
- plane_format.isl_format = ISL_FORMAT_B4G4R4A4_UNORM;
- plane_format.swizzle = ISL_SWIZZLE(GREEN, RED, ALPHA, BLUE);
- }
-
return plane_format;
}
@@ -543,14 +522,15 @@ anv_get_format_aspect(const struct intel_device_info *devinfo,
// Format capabilities
-VkFormatFeatureFlags
-anv_get_image_format_features(const struct intel_device_info *devinfo,
- VkFormat vk_format,
- const struct anv_format *anv_format,
- VkImageTiling vk_tiling,
- const struct isl_drm_modifier_info *isl_mod_info)
+VkFormatFeatureFlags2
+anv_get_image_format_features2(const struct anv_physical_device *physical_device,
+ VkFormat vk_format,
+ const struct anv_format *anv_format,
+ VkImageTiling vk_tiling,
+ const struct isl_drm_modifier_info *isl_mod_info)
{
- VkFormatFeatureFlags flags = 0;
+ const struct intel_device_info *devinfo = &physical_device->info;
+ VkFormatFeatureFlags2 flags = 0;
if (anv_format == NULL)
return 0;
@@ -558,6 +538,23 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
assert((isl_mod_info != NULL) ==
(vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT));
+ if (anv_is_format_emulated(physical_device, vk_format)) {
+ assert(isl_format_is_compressed(anv_format->planes[0].isl_format));
+
+ /* require optimal tiling so that we can decompress on upload */
+ if (vk_tiling != VK_IMAGE_TILING_OPTIMAL)
+ return 0;
+
+ /* required features for compressed formats */
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
+
+ return flags;
+ }
+
const VkImageAspectFlags aspects = vk_format_aspects(vk_format);
if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
@@ -565,23 +562,30 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
return 0;
- flags |= VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
- VK_FORMAT_FEATURE_BLIT_SRC_BIT |
- VK_FORMAT_FEATURE_BLIT_DST_BIT |
- VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
- VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
-
- if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
-
- if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && devinfo->ver >= 9)
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT;
+ flags |= VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
+
+ if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_DEPTH_COMPARISON_BIT;
+ }
return flags;
}
assert(aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
+
+ if (physical_device->video_decode_enabled &&
+ anv_format->can_video) {
+ flags |= VK_FORMAT_FEATURE_2_VIDEO_DECODE_OUTPUT_BIT_KHR |
+ VK_FORMAT_FEATURE_2_VIDEO_DECODE_DPB_BIT_KHR;
+ }
+
const struct anv_format_plane plane_format =
anv_get_format_plane(devinfo, vk_format, 0, vk_tiling);
@@ -596,29 +600,26 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
enum isl_format base_isl_format = base_plane_format.isl_format;
- /* ASTC textures must be in Y-tiled memory, and we reject compressed formats
- * with modifiers.
- */
- if (vk_tiling != VK_IMAGE_TILING_OPTIMAL &&
- isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
- return 0;
-
- /* ASTC requires nasty workarounds on BSW so we just disable it for now.
- *
- * TODO: Figure out the ASTC workarounds and re-enable on BSW.
- */
- if (devinfo->ver < 9 &&
- isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
- return 0;
-
if (isl_format_supports_sampling(devinfo, plane_format.isl_format)) {
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
- if (devinfo->ver >= 9)
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT;
+ /* Unlike other surface formats, our sampler requires that the ASTC
+ * format only be used on surfaces in non-linearly-tiled memory.
+ * Thankfully, we can make an exception for linearly-tiled images that
+ * are only used for transfers. blorp_copy will reinterpret any
+ * compressed format to an uncompressed one.
+ *
+ * We handle modifier tilings further down in this function.
+ */
+ if (vk_tiling == VK_IMAGE_TILING_LINEAR &&
+ isl_format_get_layout(plane_format.isl_format)->txc == ISL_TXC_ASTC)
+ return VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
+
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT;
if (isl_format_supports_filtering(devinfo, plane_format.isl_format))
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
}
/* We can render to swizzled formats. However, if the alpha channel is
@@ -627,31 +628,50 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
*/
if (isl_format_supports_rendering(devinfo, plane_format.isl_format) &&
plane_format.swizzle.a == ISL_CHANNEL_SELECT_ALPHA) {
- flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT;
+ flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT;
- if (isl_format_supports_alpha_blending(devinfo, plane_format.isl_format))
- flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
+ /* While we can render to swizzled formats, they don't blend correctly
+ * if there are blend constants involved. The swizzle just remaps the
+ * output of the shader to different channels in the texture. It
+ * doesn't change the interpretation of the constant blend factors in
+ * COLOR_CALC_STATE.
+ */
+ if (isl_format_supports_alpha_blending(devinfo, plane_format.isl_format) &&
+ isl_swizzle_is_identity(plane_format.swizzle))
+ flags |= VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT;
}
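   /* Worked example (illustrative, not part of the patch):
    * VK_FORMAT_B4G4R4A4_UNORM_PACK16 above is exposed through
    * ISL_FORMAT_A4B4G4R4_UNORM with a BGRA swizzle.  The swizzle remaps the
    * shader output channels, but a CONSTANT_COLOR blend factor in
    * COLOR_CALC_STATE is applied to the hardware channels unswizzled, so the
    * constants would no longer line up with the remapped data.  Blending is
    * therefore only advertised for identity swizzles.
    */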
/* Load/store is determined based on base format. This prevents RGB
* formats from showing up as load/store capable.
*/
+ if (isl_format_supports_typed_reads(devinfo, base_isl_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT;
if (isl_format_supports_typed_writes(devinfo, base_isl_format))
- flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
+
+ /* Keep this old behavior on VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT.
+ * When KHR_format_features2 is enabled, applications should only rely on
+ * it for the list of shader storage extended formats [1]. Before that,
+ * this applies to all VkFormats.
+ *
+ * [1] : https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#features-shaderStorageImageExtendedFormats
+ */
+ if (flags & VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT)
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT;
if (base_isl_format == ISL_FORMAT_R32_SINT ||
base_isl_format == ISL_FORMAT_R32_UINT ||
base_isl_format == ISL_FORMAT_R32_FLOAT)
- flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT;
if (flags) {
- flags |= VK_FORMAT_FEATURE_BLIT_SRC_BIT |
- VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
- VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
+ flags |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT;
/* Blit destination requires rendering support. */
if (isl_format_supports_rendering(devinfo, plane_format.isl_format))
- flags |= VK_FORMAT_FEATURE_BLIT_DST_BIT;
+ flags |= VK_FORMAT_FEATURE_2_BLIT_DST_BIT;
}
/* XXX: We handle 3-channel formats by switching them out for RGBX or
@@ -665,10 +685,17 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
base_isl_format != ISL_FORMAT_UNSUPPORTED &&
!util_is_power_of_two_or_zero(isl_format_layouts[base_isl_format].bpb) &&
isl_format_rgb_to_rgbx(base_isl_format) == ISL_FORMAT_UNSUPPORTED) {
- flags &= ~VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT;
- flags &= ~VK_FORMAT_FEATURE_BLIT_DST_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_BLIT_DST_BIT;
}
+ const VkFormatFeatureFlags2 disallowed_ycbcr_image_features =
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT |
+ VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BLEND_BIT |
+ VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT;
+
if (anv_format->can_ycbcr) {
/* The sampler doesn't have support for mid point when it handles YUV on
* its own.
@@ -678,35 +705,34 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
       * sampler. The failures show slightly out-of-range values on the
* bottom left of the sampled image.
*/
- flags |= VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT;
+ flags |= VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT;
} else {
- flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT |
- VK_FORMAT_FEATURE_MIDPOINT_CHROMA_SAMPLES_BIT |
- VK_FORMAT_FEATURE_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT;
+ flags |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_LINEAR_FILTER_BIT |
+ VK_FORMAT_FEATURE_2_MIDPOINT_CHROMA_SAMPLES_BIT |
+ VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_YCBCR_CONVERSION_SEPARATE_RECONSTRUCTION_FILTER_BIT;
}
   /* We can support cosited chroma locations when handling planes with our
* own shader snippets.
*/
- for (unsigned p = 0; p < anv_format->n_planes; p++) {
- if (anv_format->planes[p].denominator_scales[0] > 1 ||
- anv_format->planes[p].denominator_scales[1] > 1) {
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ vk_format_get_ycbcr_info(vk_format);
+ assert(anv_format->n_planes == ycbcr_info->n_planes);
+ for (unsigned p = 0; p < ycbcr_info->n_planes; p++) {
+ if (ycbcr_info->planes[p].denominator_scales[0] > 1 ||
+ ycbcr_info->planes[p].denominator_scales[1] > 1) {
flags |= VK_FORMAT_FEATURE_COSITED_CHROMA_SAMPLES_BIT;
break;
}
}
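   /* For example (denominator values assumed from vk_format_get_ycbcr_info()):
    * VK_FORMAT_G8_B8R8_2PLANE_420_UNORM reports scales of {2, 2} on its
    * chroma plane, so it gains the cosited-chroma bit above, while the 444
    * variants keep {1, 1} on every plane and do not.
    */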
if (anv_format->n_planes > 1)
- flags |= VK_FORMAT_FEATURE_DISJOINT_BIT;
-
- const VkFormatFeatureFlags disallowed_ycbcr_image_features =
- VK_FORMAT_FEATURE_BLIT_SRC_BIT |
- VK_FORMAT_FEATURE_BLIT_DST_BIT |
- VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT |
- VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
+ flags |= VK_FORMAT_FEATURE_2_DISJOINT_BIT;
flags &= ~disallowed_ycbcr_image_features;
+ } else if (anv_format->can_video) {
+ /* This format is for video decoding. */
+ flags &= ~disallowed_ycbcr_image_features;
}
if (vk_tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
@@ -755,14 +781,16 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
if (anv_format->n_planes > 1) {
/* For simplicity, keep DISJOINT disabled for multi-planar format. */
- flags &= ~VK_FORMAT_FEATURE_DISJOINT_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_DISJOINT_BIT;
/* VK_ANDROID_external_memory_android_hardware_buffer in Virtio-GPU
* Venus driver layers on top of VK_EXT_image_drm_format_modifier of
- * the host Vulkan driver, and VK_FORMAT_G8_B8R8_2PLANE_420_UNORM is
- * required to support camera/media interop in Android.
+ * the host Vulkan driver, and both VK_FORMAT_G8_B8R8_2PLANE_420_UNORM
+       * and VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM are required to support
+ * camera/media interop in Android.
*/
- if (vk_format != VK_FORMAT_G8_B8R8_2PLANE_420_UNORM) {
+ if (vk_format != VK_FORMAT_G8_B8R8_2PLANE_420_UNORM &&
+ vk_format != VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) {
anv_finishme("support more multi-planar formats with DRM modifiers");
return 0;
}
@@ -771,41 +799,46 @@ anv_get_image_format_features(const struct intel_device_info *devinfo,
* planes and aux planes due to the lack of defined ABI for external
* multi-planar images.
*/
- if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier)) {
return 0;
}
}
- if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E &&
- !isl_format_supports_ccs_e(devinfo, plane_format.isl_format)) {
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier) &&
+ !anv_format_supports_ccs_e(devinfo, plane_format.isl_format)) {
return 0;
}
- if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier)) {
      /* Reject DISJOINT for consistency with the GL driver. In
* eglCreateImage, we require that the dma_buf for the primary surface
* and the dma_buf for its aux surface refer to the same bo.
*/
- flags &= ~VK_FORMAT_FEATURE_DISJOINT_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_DISJOINT_BIT;
/* When the hardware accesses a storage image, it bypasses the aux
* surface. We could support storage access on images with aux
* modifiers by resolving the aux surface prior to the storage access.
*/
- flags &= ~VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
- flags &= ~VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT;
+ flags &= ~VK_FORMAT_FEATURE_2_STORAGE_IMAGE_ATOMIC_BIT;
}
}
+ if (devinfo->has_coarse_pixel_primitive_and_cb &&
+ vk_format == VK_FORMAT_R8_UINT &&
+ vk_tiling == VK_IMAGE_TILING_OPTIMAL)
+ flags |= VK_FORMAT_FEATURE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
+
return flags;
}
-static VkFormatFeatureFlags
-get_buffer_format_features(const struct intel_device_info *devinfo,
- VkFormat vk_format,
- const struct anv_format *anv_format)
+static VkFormatFeatureFlags2
+get_buffer_format_features2(const struct intel_device_info *devinfo,
+ VkFormat vk_format,
+ const struct anv_format *anv_format)
{
- VkFormatFeatureFlags flags = 0;
+ VkFormatFeatureFlags2 flags = 0;
if (anv_format == NULL)
return 0;
@@ -818,7 +851,7 @@ get_buffer_format_features(const struct intel_device_info *devinfo,
if (anv_format->n_planes > 1)
return 0;
- if (anv_format->can_ycbcr)
+ if (anv_format->can_ycbcr || anv_format->can_video)
return 0;
if (vk_format_is_depth_or_stencil(vk_format))
@@ -826,16 +859,42 @@ get_buffer_format_features(const struct intel_device_info *devinfo,
if (isl_format_supports_sampling(devinfo, isl_format) &&
!isl_format_is_compressed(isl_format))
- flags |= VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT;
+ flags |= VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT;
if (isl_format_supports_vertex_fetch(devinfo, isl_format))
- flags |= VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT;
+ flags |= VK_FORMAT_FEATURE_2_VERTEX_BUFFER_BIT;
- if (isl_is_storage_image_format(isl_format))
- flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT;
+ if (isl_is_storage_image_format(devinfo, isl_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_BIT;
if (isl_format == ISL_FORMAT_R32_SINT || isl_format == ISL_FORMAT_R32_UINT)
- flags |= VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_TEXEL_BUFFER_ATOMIC_BIT;
+
+ if (isl_format_supports_typed_reads(devinfo, isl_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_READ_WITHOUT_FORMAT_BIT;
+ if (isl_format_supports_typed_writes(devinfo, isl_format))
+ flags |= VK_FORMAT_FEATURE_2_STORAGE_WRITE_WITHOUT_FORMAT_BIT;
+
+ if (devinfo->has_ray_tracing) {
+ switch (vk_format) {
+ case VK_FORMAT_R32G32_SFLOAT:
+ case VK_FORMAT_R32G32B32_SFLOAT:
+ case VK_FORMAT_R16G16_SFLOAT:
+ case VK_FORMAT_R16G16B16A16_SFLOAT:
+ case VK_FORMAT_R16G16_SNORM:
+ case VK_FORMAT_R16G16B16A16_SNORM:
+ case VK_FORMAT_R16G16B16A16_UNORM:
+ case VK_FORMAT_R16G16_UNORM:
+ case VK_FORMAT_R8G8B8A8_UNORM:
+ case VK_FORMAT_R8G8_UNORM:
+ case VK_FORMAT_R8G8B8A8_SNORM:
+ case VK_FORMAT_R8G8_SNORM:
+ flags |= VK_FORMAT_FEATURE_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
+ break;
+ default:
+ break;
+ }
+ }
return flags;
}
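/* Illustrative application-side usage (not part of the patch): the new
 * acceleration-structure vertex-buffer bit can be queried through the 64-bit
 * buffer features by chaining VkFormatProperties3:
 *
 *    VkFormatProperties3 props3 = {
 *       .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3,
 *    };
 *    VkFormatProperties2 props2 = {
 *       .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
 *       .pNext = &props3,
 *    };
 *    vkGetPhysicalDeviceFormatProperties2(pdev, VK_FORMAT_R16G16_SNORM, &props2);
 *
 *    bool as_vertex_ok = props3.bufferFeatures &
 *       VK_FORMAT_FEATURE_2_ACCELERATION_STRUCTURE_VERTEX_BUFFER_BIT_KHR;
 */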
@@ -845,25 +904,27 @@ get_drm_format_modifier_properties_list(const struct anv_physical_device *physic
VkFormat vk_format,
VkDrmFormatModifierPropertiesListEXT *list)
{
- const struct intel_device_info *devinfo = &physical_device->info;
const struct anv_format *anv_format = anv_get_format(vk_format);
- VK_OUTARRAY_MAKE(out, list->pDrmFormatModifierProperties,
- &list->drmFormatModifierCount);
+ VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierPropertiesEXT, out,
+ list->pDrmFormatModifierProperties,
+ &list->drmFormatModifierCount);
isl_drm_modifier_info_for_each(isl_mod_info) {
- VkFormatFeatureFlags features =
- anv_get_image_format_features(devinfo, vk_format, anv_format,
- VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
- isl_mod_info);
+ VkFormatFeatureFlags2 features2 =
+ anv_get_image_format_features2(physical_device, vk_format, anv_format,
+ VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
+ isl_mod_info);
+ VkFormatFeatureFlags features = vk_format_features2_to_features(features2);
if (!features)
continue;
- uint32_t planes = anv_format->n_planes;
- if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE)
- ++planes;
+ const uint32_t planes =
+ isl_drm_modifier_get_plane_count(&physical_device->info,
+ isl_mod_info->modifier,
+ anv_format->n_planes);
- vk_outarray_append(&out, out_props) {
+ vk_outarray_append_typed(VkDrmFormatModifierPropertiesEXT, &out, out_props) {
*out_props = (VkDrmFormatModifierPropertiesEXT) {
.drmFormatModifier = isl_mod_info->modifier,
.drmFormatModifierPlaneCount = planes,
@@ -873,43 +934,89 @@ get_drm_format_modifier_properties_list(const struct anv_physical_device *physic
}
}
-void anv_GetPhysicalDeviceFormatProperties(
- VkPhysicalDevice physicalDevice,
- VkFormat vk_format,
- VkFormatProperties* pFormatProperties)
+static void
+get_drm_format_modifier_properties_list_2(const struct anv_physical_device *physical_device,
+ VkFormat vk_format,
+ VkDrmFormatModifierPropertiesList2EXT *list)
{
- ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
- const struct intel_device_info *devinfo = &physical_device->info;
const struct anv_format *anv_format = anv_get_format(vk_format);
- *pFormatProperties = (VkFormatProperties) {
- .linearTilingFeatures =
- anv_get_image_format_features(devinfo, vk_format, anv_format,
- VK_IMAGE_TILING_LINEAR, NULL),
- .optimalTilingFeatures =
- anv_get_image_format_features(devinfo, vk_format, anv_format,
- VK_IMAGE_TILING_OPTIMAL, NULL),
- .bufferFeatures =
- get_buffer_format_features(devinfo, vk_format, anv_format),
- };
+ VK_OUTARRAY_MAKE_TYPED(VkDrmFormatModifierProperties2EXT, out,
+ list->pDrmFormatModifierProperties,
+ &list->drmFormatModifierCount);
+
+ isl_drm_modifier_info_for_each(isl_mod_info) {
+ VkFormatFeatureFlags2 features2 =
+ anv_get_image_format_features2(physical_device, vk_format, anv_format,
+ VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
+ isl_mod_info);
+ if (!features2)
+ continue;
+
+ const uint32_t planes =
+ isl_drm_modifier_get_plane_count(&physical_device->info,
+ isl_mod_info->modifier,
+ anv_format->n_planes);
+
+ vk_outarray_append_typed(VkDrmFormatModifierProperties2EXT, &out, out_props) {
+ *out_props = (VkDrmFormatModifierProperties2EXT) {
+ .drmFormatModifier = isl_mod_info->modifier,
+ .drmFormatModifierPlaneCount = planes,
+ .drmFormatModifierTilingFeatures = features2,
+ };
+ };
+ }
}
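/* Illustrative application-side usage (not part of the patch): the list is
 * filled with the usual two-call pattern, first for the count, then for the
 * entries:
 *
 *    VkDrmFormatModifierPropertiesList2EXT mod_list = {
 *       .sType = VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT,
 *    };
 *    VkFormatProperties2 props = {
 *       .sType = VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2,
 *       .pNext = &mod_list,
 *    };
 *    vkGetPhysicalDeviceFormatProperties2(pdev, format, &props);
 *
 *    VkDrmFormatModifierProperties2EXT *mods =
 *       calloc(mod_list.drmFormatModifierCount, sizeof(*mods));
 *    mod_list.pDrmFormatModifierProperties = mods;
 *    vkGetPhysicalDeviceFormatProperties2(pdev, format, &props);
 */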
void anv_GetPhysicalDeviceFormatProperties2(
VkPhysicalDevice physicalDevice,
- VkFormat format,
+ VkFormat vk_format,
VkFormatProperties2* pFormatProperties)
{
ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
- anv_GetPhysicalDeviceFormatProperties(physicalDevice, format,
- &pFormatProperties->formatProperties);
+ const struct intel_device_info *devinfo = &physical_device->info;
+ const struct anv_format *anv_format = anv_get_format(vk_format);
+
+ assert(pFormatProperties->sType == VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2);
+
+ VkFormatFeatureFlags2 linear2, optimal2, buffer2;
+ linear2 = anv_get_image_format_features2(physical_device, vk_format,
+ anv_format,
+ VK_IMAGE_TILING_LINEAR, NULL);
+ optimal2 = anv_get_image_format_features2(physical_device, vk_format,
+ anv_format,
+ VK_IMAGE_TILING_OPTIMAL, NULL);
+ buffer2 = get_buffer_format_features2(devinfo, vk_format, anv_format);
+
+ pFormatProperties->formatProperties = (VkFormatProperties) {
+ .linearTilingFeatures = vk_format_features2_to_features(linear2),
+ .optimalTilingFeatures = vk_format_features2_to_features(optimal2),
+ .bufferFeatures = vk_format_features2_to_features(buffer2),
+ };
vk_foreach_struct(ext, pFormatProperties->pNext) {
/* Use unsigned since some cases are not in the VkStructureType enum. */
switch ((unsigned)ext->sType) {
case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_EXT:
- get_drm_format_modifier_properties_list(physical_device, format,
+ get_drm_format_modifier_properties_list(physical_device, vk_format,
(void *)ext);
break;
+
+ case VK_STRUCTURE_TYPE_DRM_FORMAT_MODIFIER_PROPERTIES_LIST_2_EXT:
+ get_drm_format_modifier_properties_list_2(physical_device, vk_format,
+ (void *)ext);
+ break;
+
+ case VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_3: {
+ VkFormatProperties3 *props = (VkFormatProperties3 *)ext;
+ props->linearTilingFeatures = linear2;
+ props->optimalTilingFeatures = optimal2;
+ props->bufferFeatures = buffer2;
+ break;
+ }
+ case VK_STRUCTURE_TYPE_VIDEO_PROFILE_LIST_INFO_KHR:
+      /* don't have anything to use this for yet */
+ break;
default:
anv_debug_ignored_stype(ext->sType);
break;
@@ -917,61 +1024,347 @@ void anv_GetPhysicalDeviceFormatProperties2(
}
}
+static bool
+anv_format_supports_usage(
+ VkFormatFeatureFlags2 format_feature_flags,
+ VkImageUsageFlags usage_flags)
+{
+ if (usage_flags & VK_IMAGE_USAGE_TRANSFER_SRC_BIT) {
+ if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT))) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_TRANSFER_DST_BIT) {
+ if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT |
+ VK_FORMAT_FEATURE_2_BLIT_DST_BIT))) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_SAMPLED_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT)) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_STORAGE_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT)) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT)) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) {
+ /* Nothing to check. */
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
+ if (!(format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT))) {
+ return false;
+ }
+ }
+
+ if (usage_flags & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) {
+ if (!(format_feature_flags & VK_FORMAT_FEATURE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR)) {
+ return false;
+ }
+ }
+
+ return true;
+}
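/* Example (illustrative): an image requesting only
 * VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT passes this
 * check as long as the format reports SAMPLED_IMAGE plus either TRANSFER_DST
 * or BLIT_DST; a missing COLOR_ATTACHMENT bit is irrelevant because that
 * usage was not requested.
 */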
+
+static bool
+anv_formats_are_compatible(
+ const struct anv_format *img_fmt, const struct anv_format *img_view_fmt,
+ const struct intel_device_info *devinfo, VkImageTiling tiling,
+ bool allow_texel_compatible)
+{
+ if (img_view_fmt->vk_format == VK_FORMAT_UNDEFINED)
+ return false;
+
+ if (img_fmt == img_view_fmt)
+ return true;
+
+ /* TODO: Handle multi-planar images that can have view of a plane with
+ * possibly different type.
+ */
+ if (img_fmt->n_planes != 1 || img_view_fmt->n_planes != 1)
+ return false;
+
+ const enum isl_format img_isl_fmt =
+ anv_get_format_plane(devinfo, img_fmt->vk_format, 0, tiling).isl_format;
+ const enum isl_format img_view_isl_fmt =
+ anv_get_format_plane(devinfo, img_view_fmt->vk_format, 0, tiling).isl_format;
+ if (img_isl_fmt == ISL_FORMAT_UNSUPPORTED ||
+ img_view_isl_fmt == ISL_FORMAT_UNSUPPORTED)
+ return false;
+
+ const struct isl_format_layout *img_fmt_layout =
+ isl_format_get_layout(img_isl_fmt);
+ const struct isl_format_layout *img_view_fmt_layout =
+ isl_format_get_layout(img_view_isl_fmt);
+
+ /* From the Vulkan 1.3.230 spec "12.5. Image Views"
+ *
+ * "If image was created with the
+ * VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT flag, format must be
+ * compatible with the image’s format as described above; or must be
+ * an uncompressed format, in which case it must be size-compatible
+ * with the image’s format."
+ */
+ if (allow_texel_compatible &&
+ isl_format_is_compressed(img_isl_fmt) &&
+ !isl_format_is_compressed(img_view_isl_fmt) &&
+ img_fmt_layout->bpb == img_view_fmt_layout->bpb)
+ return true;
+
+ if (isl_format_is_compressed(img_isl_fmt) !=
+ isl_format_is_compressed(img_view_isl_fmt))
+ return false;
+
+ if (!isl_format_is_compressed(img_isl_fmt)) {
+ /* From the Vulkan 1.3.224 spec "43.1.6. Format Compatibility Classes":
+ *
+ * "Uncompressed color formats are compatible with each other if they
+ * occupy the same number of bits per texel block."
+ */
+ return img_fmt_layout->bpb == img_view_fmt_layout->bpb;
+ }
+
+ /* From the Vulkan 1.3.224 spec "43.1.6. Format Compatibility Classes":
+ *
+ * "Compressed color formats are compatible with each other if the only
+ * difference between them is the numerical type of the uncompressed
+ * pixels (e.g. signed vs. unsigned, or SRGB vs. UNORM encoding)."
+ */
+ return img_fmt_layout->txc == img_view_fmt_layout->txc &&
+ isl_formats_have_same_bits_per_channel(img_isl_fmt, img_view_isl_fmt);
+}
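/* Examples (illustrative): VK_FORMAT_R32G32B32A32_SFLOAT and
 * VK_FORMAT_R32G32B32A32_UINT are both 128-bit uncompressed blocks, so they
 * are compatible here.  VK_FORMAT_BC1_RGBA_UNORM_BLOCK and
 * VK_FORMAT_BC1_RGBA_SRGB_BLOCK share a texel compression class and channel
 * widths, so they are compatible too, while BC1 and BC7 are not because
 * their compression classes differ.
 */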
+
+/* Returns a set of feature flags supported by any of the VkFormats listed in
+ * format_list_info or any VkFormat compatible with format.
+ */
+static VkFormatFeatureFlags2
+anv_formats_gather_format_features(
+ const struct anv_physical_device *physical_device,
+ const struct anv_format *format,
+ VkImageTiling tiling,
+ const struct isl_drm_modifier_info *isl_mod_info,
+ const VkImageFormatListCreateInfo *format_list_info,
+ bool allow_texel_compatible)
+{
+ const struct intel_device_info *devinfo = &physical_device->info;
+ VkFormatFeatureFlags2 all_formats_feature_flags = 0;
+
+   /* We need to check that each of the usage bits is allowed for at least
+ * one of the potential formats.
+ */
+ if (!format_list_info || format_list_info->viewFormatCount == 0) {
+ /* If we specify no list of possible formats, we need to assume that
+ * every compatible format is possible and consider the features
+ * supported by each of them.
+ */
+ for (uint32_t fmt_arr_ind = 0;
+ fmt_arr_ind < ARRAY_SIZE(anv_formats);
+ ++fmt_arr_ind) {
+ for (uint32_t fmt_ind = 0;
+ fmt_ind < anv_formats[fmt_arr_ind].n_formats;
+ ++fmt_ind) {
+ const struct anv_format *possible_anv_format =
+ &(anv_formats[fmt_arr_ind].formats[fmt_ind]);
+
+ if (anv_formats_are_compatible(format, possible_anv_format,
+ devinfo, tiling,
+ allow_texel_compatible)) {
+ VkFormatFeatureFlags2 view_format_features =
+ anv_get_image_format_features2(physical_device,
+ possible_anv_format->vk_format,
+ possible_anv_format, tiling,
+ isl_mod_info);
+ all_formats_feature_flags |= view_format_features;
+ }
+ }
+ }
+ } else {
+ /* If we provide the list of possible formats, then check just them. */
+ for (uint32_t i = 0; i < format_list_info->viewFormatCount; ++i) {
+ VkFormat vk_view_format = format_list_info->pViewFormats[i];
+
+ if (vk_view_format == VK_FORMAT_UNDEFINED)
+ continue;
+
+ const struct anv_format *anv_view_format =
+ anv_get_format(vk_view_format);
+ VkFormatFeatureFlags2 view_format_features =
+ anv_get_image_format_features2(physical_device,
+ vk_view_format, anv_view_format,
+ tiling, isl_mod_info);
+ all_formats_feature_flags |= view_format_features;
+ }
+ }
+
+ return all_formats_feature_flags;
+}
+
+/* Supports opaque fd but not dma_buf. */
+static const VkExternalMemoryProperties opaque_fd_only_props = {
+ .externalMemoryFeatures =
+ VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
+ VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
+ .exportFromImportedHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+};
+
+/* Supports opaque fd and dma_buf. */
+static const VkExternalMemoryProperties opaque_fd_dma_buf_props = {
+ .externalMemoryFeatures =
+ VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
+ VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
+ .exportFromImportedHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT |
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT |
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
+};
+
+static const VkExternalMemoryProperties userptr_props = {
+ .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
+ .exportFromImportedHandleTypes = 0,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT,
+};
+
+static const VkExternalMemoryProperties android_buffer_props = {
+ .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
+ VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
+ .exportFromImportedHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
+};
+
+
+static const VkExternalMemoryProperties android_image_props = {
+ /* VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT will be set dynamically */
+ .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT |
+ VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT,
+ .exportFromImportedHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
+ .compatibleHandleTypes =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
+};
+
static VkResult
anv_get_image_format_properties(
struct anv_physical_device *physical_device,
const VkPhysicalDeviceImageFormatInfo2 *info,
- VkImageFormatProperties *pImageFormatProperties,
- VkSamplerYcbcrConversionImageFormatProperties *pYcbcrImageFormatProperties)
+ VkImageFormatProperties2 *props)
{
- VkFormatFeatureFlags format_feature_flags;
+ VkFormatFeatureFlags2 format_feature_flags;
VkExtent3D maxExtent;
uint32_t maxMipLevels;
uint32_t maxArraySize;
VkSampleCountFlags sampleCounts;
- struct anv_instance *instance = physical_device->instance;
const struct intel_device_info *devinfo = &physical_device->info;
const struct anv_format *format = anv_get_format(info->format);
const struct isl_drm_modifier_info *isl_mod_info = NULL;
- const VkImageFormatListCreateInfo *format_list_info =
- vk_find_struct_const(info->pNext, IMAGE_FORMAT_LIST_CREATE_INFO);
+ const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *modifier_info = NULL;
+ const VkImageFormatListCreateInfo *format_list_info = NULL;
+ const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL;
+ VkExternalImageFormatProperties *external_props = NULL;
+ VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL;
+ VkAndroidHardwareBufferUsageANDROID *android_usage = NULL;
+ VkTextureLODGatherFormatPropertiesAMD *texture_lod_gather_props = NULL;
+ VkImageCompressionPropertiesEXT *comp_props = NULL;
+ bool from_wsi = false;
+
+ /* Extract input structs */
+ vk_foreach_struct_const(s, info->pNext) {
+ switch ((unsigned)s->sType) {
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO:
+ external_info = (const void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT:
+ modifier_info = (const void *)s;
+ break;
+ case VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO:
+ format_list_info = (const void *)s;
+ break;
+ case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO:
+ /* Ignore but don't warn */
+ break;
+ case VK_STRUCTURE_TYPE_WSI_IMAGE_CREATE_INFO_MESA:
+ from_wsi = true;
+ break;
+ case VK_STRUCTURE_TYPE_VIDEO_PROFILE_LIST_INFO_KHR:
+ /* Ignore but don't warn */
+ break;
+ case VK_STRUCTURE_TYPE_IMAGE_COMPRESSION_CONTROL_EXT:
+ /* Ignore but don't warn */
+ break;
+ default:
+ anv_debug_ignored_stype(s->sType);
+ break;
+ }
+ }
+
+ /* Extract output structs */
+ vk_foreach_struct(s, props->pNext) {
+ switch (s->sType) {
+ case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES:
+ external_props = (void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES:
+ ycbcr_props = (void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID:
+ android_usage = (void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_TEXTURE_LOD_GATHER_FORMAT_PROPERTIES_AMD:
+ texture_lod_gather_props = (void *) s;
+ break;
+ case VK_STRUCTURE_TYPE_IMAGE_COMPRESSION_PROPERTIES_EXT:
+ comp_props = (void *) s;
+ break;
+ default:
+ anv_debug_ignored_stype(s->sType);
+ break;
+ }
+ }
if (format == NULL)
goto unsupported;
if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
- const VkPhysicalDeviceImageDrmFormatModifierInfoEXT *vk_mod_info =
- vk_find_struct_const(info->pNext, PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT);
-
- isl_mod_info = isl_drm_modifier_get_info(vk_mod_info->drmFormatModifier);
+ isl_mod_info = isl_drm_modifier_get_info(modifier_info->drmFormatModifier);
if (isl_mod_info == NULL)
goto unsupported;
- }
- assert(format->vk_format == info->format);
- format_feature_flags = anv_get_image_format_features(devinfo, info->format,
- format, info->tiling,
- isl_mod_info);
-
- /* Remove the VkFormatFeatureFlags that are incompatible with any declared
- * image view format. (Removals are more likely to occur when a DRM format
- * modifier is present).
- */
- if ((info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) && format_list_info) {
- for (uint32_t i = 0; i < format_list_info->viewFormatCount; ++i) {
- VkFormat vk_view_format = format_list_info->pViewFormats[i];
- const struct anv_format *anv_view_format = anv_get_format(vk_view_format);
- VkFormatFeatureFlags view_format_features =
- anv_get_image_format_features(devinfo, vk_view_format,
- anv_view_format,
- info->tiling,
- isl_mod_info);
- format_feature_flags &= view_format_features;
+ /* only allow Y-tiling/Tile4 for video decode. */
+ if (info->usage & VK_IMAGE_USAGE_VIDEO_DECODE_DST_BIT_KHR) {
+ if (isl_mod_info->tiling != ISL_TILING_Y0 && isl_mod_info->tiling != ISL_TILING_4)
+ goto unsupported;
}
}
- if (!format_feature_flags)
- goto unsupported;
+ assert(format->vk_format == info->format);
switch (info->type) {
default:
@@ -999,29 +1392,90 @@ anv_get_image_format_properties(
maxExtent.width = 2048;
maxExtent.height = 2048;
maxExtent.depth = 2048;
- /* Prior to SKL, the mipmaps for 3D surfaces are laid out in a way
- * that make it impossible to represent in the way that
- * VkSubresourceLayout expects. Since we can't tell users how to make
- * sense of them, don't report them as available.
- */
- if (devinfo->ver < 9 && info->tiling == VK_IMAGE_TILING_LINEAR)
- maxMipLevels = 1;
- else
- maxMipLevels = 12; /* log2(maxWidth) + 1 */
+ maxMipLevels = 12; /* log2(maxWidth) + 1 */
maxArraySize = 1;
sampleCounts = VK_SAMPLE_COUNT_1_BIT;
break;
}
+   /* If any of the formats in VkImageFormatListCreateInfo is completely
+ * unsupported, report unsupported.
+ */
+ if ((info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) &&
+ format_list_info != NULL) {
+ for (uint32_t i = 0; i < format_list_info->viewFormatCount; i++) {
+ const struct anv_format *view_format =
+ anv_get_format(format_list_info->pViewFormats[i]);
+ if (view_format == NULL)
+ goto unsupported;
+ }
+ }
+
+ /* From the Vulkan 1.3.218 spec:
+ *
+ * "For images created without VK_IMAGE_CREATE_EXTENDED_USAGE_BIT a usage
+ * bit is valid if it is supported for the format the image is created with.
+ * For images created with VK_IMAGE_CREATE_EXTENDED_USAGE_BIT a usage bit
+ * is valid if it is supported for at least one of the formats
+ * a VkImageView created from the image can have."
+ *
+ * "VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT specifies that the image can be
+ * used to create a VkImageView with a different format from the image."
+ *
+ * So, if both VK_IMAGE_CREATE_EXTENDED_USAGE_BIT and
+ * VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT are set, views can be created with
+ * different usage than the image, so we can't always filter on usage.
+ * There is one exception to this below for storage.
+ */
+ format_feature_flags = anv_get_image_format_features2(physical_device,
+ info->format, format,
+ info->tiling,
+ isl_mod_info);
+
+ if (!anv_format_supports_usage(format_feature_flags, info->usage)) {
+      /* If the image format itself does not support the usage, and we don't
+       * allow view formats to support it, we can't support this usage at all.
+ */
+ if (!(info->flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT) ||
+ !(info->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT))
+ goto unsupported;
+
+ /* We don't want emulated formats to gain unexpected usage (storage in
+       * particular) from their compatible view formats.
+ */
+ if (anv_is_format_emulated(physical_device, info->format))
+ goto unsupported;
+
+ /* From the Vulkan 1.3.224 spec "43.1.6. Format Compatibility Classes":
+ *
+ * "Each depth/stencil format is only compatible with itself."
+ *
+ * So, other formats also can't help.
+ */
+ if (vk_format_is_depth_or_stencil(info->format))
+ goto unsupported;
+
+ /* Gather all possible format feature flags for the formats listed in
+ * the format list or all the compatible formats.
+ */
+ VkFormatFeatureFlags2 all_formats_feature_flags = format_feature_flags |
+ anv_formats_gather_format_features(physical_device, format,
+ info->tiling, isl_mod_info,
+ format_list_info,
+ info->flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT);
+
+ if (!anv_format_supports_usage(all_formats_feature_flags, info->usage))
+ goto unsupported;
+ }
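   /* Illustrative example (not part of the patch): a VK_FORMAT_R8G8B8A8_SRGB
    * image created with MUTABLE_FORMAT | EXTENDED_USAGE and STORAGE usage can
    * still be accepted here, because a compatible view format such as
    * VK_FORMAT_R8G8B8A8_UNORM contributes
    * VK_FORMAT_FEATURE_2_STORAGE_IMAGE_BIT to all_formats_feature_flags even
    * though the SRGB format itself does not.
    */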
+
if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
/* We support modifiers only for "simple" (that is, non-array
* non-mipmapped single-sample) 2D images.
*/
if (info->type != VK_IMAGE_TYPE_2D) {
- vk_errorfi(instance, &physical_device->vk.base,
- VK_ERROR_FORMAT_NOT_SUPPORTED,
- "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT "
- "requires VK_IMAGE_TYPE_2D");
+ vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT "
+ "requires VK_IMAGE_TYPE_2D");
goto unsupported;
}
@@ -1029,9 +1483,10 @@ anv_get_image_format_properties(
maxMipLevels = 1;
sampleCounts = VK_SAMPLE_COUNT_1_BIT;
- if (isl_mod_info->aux_usage == ISL_AUX_USAGE_CCS_E &&
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier) &&
!anv_formats_ccs_e_compatible(devinfo, info->flags, info->format,
- info->tiling, format_list_info)) {
+ info->tiling, info->usage,
+ format_list_info)) {
goto unsupported;
}
}
@@ -1049,45 +1504,23 @@ anv_get_image_format_properties(
if (info->tiling == VK_IMAGE_TILING_OPTIMAL &&
info->type == VK_IMAGE_TYPE_2D &&
- (format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
+ (format_feature_flags & (VK_FORMAT_FEATURE_2_COLOR_ATTACHMENT_BIT |
+ VK_FORMAT_FEATURE_2_DEPTH_STENCIL_ATTACHMENT_BIT)) &&
!(info->flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT) &&
- !(info->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
+ !(info->usage & VK_IMAGE_USAGE_STORAGE_BIT) &&
+ isl_format_supports_multisampling(devinfo, format->planes[0].isl_format)) {
sampleCounts = isl_device_get_sample_counts(&physical_device->isl_dev);
}
- if (info->usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
- VK_IMAGE_USAGE_TRANSFER_DST_BIT)) {
- /* Accept transfers on anything we can sample from or renderer to. */
- if (!(format_feature_flags & (VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT |
- VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT))) {
- goto unsupported;
- }
- }
-
- if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) {
- goto unsupported;
- }
- }
-
if (info->usage & VK_IMAGE_USAGE_STORAGE_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT)) {
- goto unsupported;
- }
- }
-
- if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT)) {
- goto unsupported;
- }
- }
-
- if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
- if (!(format_feature_flags & VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
+ /* Non-power-of-two formats can never be used as storage images. We
+ * only check plane 0 because there are no YCbCr formats with
+ * non-power-of-two planes.
+ */
+ const struct isl_format_layout *isl_layout =
+ isl_format_get_layout(format->planes[0].isl_format);
+ if (!util_is_power_of_two_or_zero(isl_layout->bpb))
goto unsupported;
- }
}
if (info->flags & VK_IMAGE_CREATE_DISJOINT_BIT) {
@@ -1095,11 +1528,11 @@ anv_get_image_format_properties(
*
* If format is a multi-planar format, and if imageCreateFormatFeatures
* (as defined in Image Creation Limits) does not contain
- * VK_FORMAT_FEATURE_DISJOINT_BIT, then flags must not contain
+ * VK_FORMAT_FEATURE_2_DISJOINT_BIT, then flags must not contain
* VK_IMAGE_CREATE_DISJOINT_BIT.
*/
if (format->n_planes > 1 &&
- !(format_feature_flags & VK_FORMAT_FEATURE_DISJOINT_BIT)) {
+ !(format_feature_flags & VK_FORMAT_FEATURE_2_DISJOINT_BIT)) {
goto unsupported;
}
@@ -1115,7 +1548,7 @@ anv_get_image_format_properties(
}
if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT &&
- isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ isl_drm_modifier_has_aux(isl_mod_info->modifier)) {
      /* Reject DISJOINT for consistency with the GL driver. In
* eglCreateImage, we require that the dma_buf for the primary surface
* and the dma_buf for its aux surface refer to the same bo.
@@ -1124,7 +1557,7 @@ anv_get_image_format_properties(
}
}
- if (info->flags & VK_IMAGE_CREATE_ALIAS_BIT) {
+ if ((info->flags & VK_IMAGE_CREATE_ALIAS_BIT) && !from_wsi) {
/* Reject aliasing of images with non-linear DRM format modifiers because:
*
* 1. For modifiers with compression, we store aux tracking state in
@@ -1134,6 +1567,9 @@ anv_get_image_format_properties(
* 2. For tiled modifiers without compression, we may attempt to compress
* them behind the scenes, in which case both the aux tracking state
* and the CCS data are bound to ANV_IMAGE_MEMORY_BINDING_PRIVATE.
+ *
+ * 3. For WSI we should ignore ALIAS_BIT because we have the ability to
+ * bind the ANV_MEMORY_BINDING_PRIVATE from the other WSI image.
*/
if (info->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT &&
isl_mod_info->modifier != DRM_FORMAT_MOD_LINEAR) {
@@ -1141,30 +1577,20 @@ anv_get_image_format_properties(
}
}
- if (info->usage & VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT) {
- /* Nothing to check. */
- }
-
- if (info->usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
- /* Ignore this flag because it was removed from the
- * provisional_I_20150910 header.
- */
- }
+ if ((info->usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR) &&
+ !devinfo->has_coarse_pixel_primitive_and_cb)
+ goto unsupported;
/* From the bspec section entitled "Surface Layout and Tiling",
- * pre-gfx9 has a 2 GB limitation of the size in bytes,
- * gfx9 and gfx10 have a 256 GB limitation and gfx11+
- * has a 16 TB limitation.
+ * Gfx9 has a 256 GB limitation and Gfx11+ has a 16 TB limitation.
*/
uint64_t maxResourceSize = 0;
- if (devinfo->ver < 9)
- maxResourceSize = (uint64_t) 1 << 31;
- else if (devinfo->ver < 11)
+ if (devinfo->ver < 11)
maxResourceSize = (uint64_t) 1 << 38;
else
maxResourceSize = (uint64_t) 1 << 44;
- *pImageFormatProperties = (VkImageFormatProperties) {
+ props->imageFormatProperties = (VkImageFormatProperties) {
.maxExtent = maxExtent,
.maxMipLevels = maxMipLevels,
.maxArrayLayers = maxArraySize,
@@ -1176,166 +1602,23 @@ anv_get_image_format_properties(
.maxResourceSize = maxResourceSize,
};
- if (pYcbcrImageFormatProperties) {
- pYcbcrImageFormatProperties->combinedImageSamplerDescriptorCount =
- format->n_planes;
- }
-
- return VK_SUCCESS;
-
-unsupported:
- *pImageFormatProperties = (VkImageFormatProperties) {
- .maxExtent = { 0, 0, 0 },
- .maxMipLevels = 0,
- .maxArrayLayers = 0,
- .sampleCounts = 0,
- .maxResourceSize = 0,
- };
-
- return VK_ERROR_FORMAT_NOT_SUPPORTED;
-}
-
-VkResult anv_GetPhysicalDeviceImageFormatProperties(
- VkPhysicalDevice physicalDevice,
- VkFormat format,
- VkImageType type,
- VkImageTiling tiling,
- VkImageUsageFlags usage,
- VkImageCreateFlags createFlags,
- VkImageFormatProperties* pImageFormatProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
-
- const VkPhysicalDeviceImageFormatInfo2 info = {
- .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
- .pNext = NULL,
- .format = format,
- .type = type,
- .tiling = tiling,
- .usage = usage,
- .flags = createFlags,
- };
-
- return anv_get_image_format_properties(physical_device, &info,
- pImageFormatProperties, NULL);
-}
-
-
-/* Supports opaque fd but not dma_buf. */
-static const VkExternalMemoryProperties opaque_fd_only_props = {
- .externalMemoryFeatures =
- VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
- .exportFromImportedHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
-};
-
-/* Supports opaque fd and dma_buf. */
-static const VkExternalMemoryProperties opaque_fd_dma_buf_props = {
- .externalMemoryFeatures =
- VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
- .exportFromImportedHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT,
-};
-
-static const VkExternalMemoryProperties userptr_props = {
- .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
- .exportFromImportedHandleTypes = 0,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT,
-};
-
-static const VkExternalMemoryProperties android_buffer_props = {
- .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT,
- .exportFromImportedHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
-};
-
-
-static const VkExternalMemoryProperties android_image_props = {
- .externalMemoryFeatures = VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT |
- VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT,
- .exportFromImportedHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
- .compatibleHandleTypes =
- VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
-};
-
-VkResult anv_GetPhysicalDeviceImageFormatProperties2(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceImageFormatInfo2* base_info,
- VkImageFormatProperties2* base_props)
-{
- ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
- struct anv_instance *instance = physical_device->instance;
- const VkPhysicalDeviceExternalImageFormatInfo *external_info = NULL;
- VkExternalImageFormatProperties *external_props = NULL;
- VkSamplerYcbcrConversionImageFormatProperties *ycbcr_props = NULL;
- VkAndroidHardwareBufferUsageANDROID *android_usage = NULL;
- VkResult result;
-
- /* Extract input structs */
- vk_foreach_struct_const(s, base_info->pNext) {
- switch (s->sType) {
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO:
- external_info = (const void *) s;
- break;
- case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_DRM_FORMAT_MODIFIER_INFO_EXT:
- /* anv_get_image_format_properties will handle this */
- break;
- case VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO_EXT:
- /* Ignore but don't warn */
- break;
- default:
- anv_debug_ignored_stype(s->sType);
- break;
- }
- }
+ if (ycbcr_props)
+ ycbcr_props->combinedImageSamplerDescriptorCount = format->n_planes;
- /* Extract output structs */
- vk_foreach_struct(s, base_props->pNext) {
- switch (s->sType) {
- case VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES:
- external_props = (void *) s;
- break;
- case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_IMAGE_FORMAT_PROPERTIES:
- ycbcr_props = (void *) s;
- break;
- case VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_USAGE_ANDROID:
- android_usage = (void *) s;
- break;
- default:
- anv_debug_ignored_stype(s->sType);
- break;
- }
+ if (texture_lod_gather_props) {
+ texture_lod_gather_props->supportsTextureGatherLODBiasAMD =
+ physical_device->info.ver >= 20;
}
- result = anv_get_image_format_properties(physical_device, base_info,
- &base_props->imageFormatProperties, ycbcr_props);
- if (result != VK_SUCCESS)
- goto fail;
-
bool ahw_supported =
physical_device->vk.supported_extensions.ANDROID_external_memory_android_hardware_buffer;
if (ahw_supported && android_usage) {
android_usage->androidHardwareBufferUsage =
- anv_ahw_usage_from_vk_usage(base_info->flags,
- base_info->usage);
+ vk_image_usage_to_ahb_usage(info->flags, info->usage);
/* Limit maxArrayLayers to 1 for AHardwareBuffer based images for now. */
- base_props->imageFormatProperties.maxArrayLayers = 1;
+ props->imageFormatProperties.maxArrayLayers = 1;
}
/* From the Vulkan 1.0.42 spec:
@@ -1350,7 +1633,7 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
*/
bool tiling_has_explicit_layout;
- switch (base_info->tiling) {
+ switch (info->tiling) {
default:
unreachable("bad VkImageTiling");
case VK_IMAGE_TILING_LINEAR:
@@ -1379,12 +1662,12 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* method exists, then we reject image creation here.
*
* If the memory handle requires matching
- * VkPhysicalDeviceIDPropertiesKHR::driverUUID and ::deviceUUID, then the
+ * VkPhysicalDeviceIDProperties::driverUUID and ::deviceUUID, then the
* match-requirement guarantees that all users of the image agree on the
* image's memory layout.
*
* If the memory handle does not require matching
- * VkPhysicalDeviceIDPropertiesKHR::driverUUID nor ::deviceUUID, then we
+ * VkPhysicalDeviceIDProperties::driverUUID nor ::deviceUUID, then we
* require that the app and driver be able to explicitly communicate to
* each other the image's memory layout.
*
@@ -1414,12 +1697,11 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* and therefore requires explicit memory layout.
*/
if (!tiling_has_explicit_layout) {
- result = vk_errorfi(instance, &physical_device->vk.base,
- VK_ERROR_FORMAT_NOT_SUPPORTED,
- "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT "
- "requires VK_IMAGE_TILING_LINEAR or "
- "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
- goto fail;
+ vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT "
+ "requires VK_IMAGE_TILING_LINEAR or "
+ "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
+ goto unsupported;
}
/* With an explicit memory layout, we don't care which type of fd
@@ -1434,12 +1716,11 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* and therefore requires explicit memory layout.
*/
if (!tiling_has_explicit_layout) {
- result = vk_errorfi(instance, &physical_device->vk.base,
- VK_ERROR_FORMAT_NOT_SUPPORTED,
- "VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT "
- "requires VK_IMAGE_TILING_LINEAR or "
- "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
- goto fail;
+ vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT "
+ "requires VK_IMAGE_TILING_LINEAR or "
+ "VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT");
+ goto unsupported;
}
if (external_props)
@@ -1451,8 +1732,14 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* requires support for VK_IMAGE_TILING_OPTIMAL. Android systems
* communicate the image's memory layout through backdoor channels.
*/
- if (ahw_supported && external_props) {
- external_props->externalMemoryProperties = android_image_props;
+ if (ahw_supported) {
+ if (external_props) {
+ external_props->externalMemoryProperties = android_image_props;
+ if (anv_ahb_format_for_vk_format(info->format)) {
+ external_props->externalMemoryProperties.externalMemoryFeatures |=
+ VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT;
+ }
+ }
break;
}
FALLTHROUGH; /* If ahw not supported */
@@ -1464,43 +1751,56 @@ VkResult anv_GetPhysicalDeviceImageFormatProperties2(
* vkGetPhysicalDeviceImageFormatProperties2 returns
* VK_ERROR_FORMAT_NOT_SUPPORTED.
*/
- result = vk_errorfi(instance, &physical_device->vk.base,
- VK_ERROR_FORMAT_NOT_SUPPORTED,
- "unsupported VkExternalMemoryTypeFlagBits 0x%x",
- external_info->handleType);
- goto fail;
+ vk_errorf(physical_device, VK_ERROR_FORMAT_NOT_SUPPORTED,
+ "unsupported VkExternalMemoryTypeFlagBits 0x%x",
+ external_info->handleType);
+ goto unsupported;
}
}
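+   /* VK_EXT_image_compression_control: lossless CCS compression is reported
+    * as the default whenever the format/usage combination supports it;
+    * fixed-rate compression is never advertised.
+    */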
+ if (comp_props) {
+ bool ccs_supported =
+ anv_formats_ccs_e_compatible(devinfo, info->flags, info->format,
+ info->tiling, info->usage,
+ format_list_info);
+ comp_props->imageCompressionFixedRateFlags =
+ VK_IMAGE_COMPRESSION_FIXED_RATE_NONE_EXT;
+ comp_props->imageCompressionFlags = ccs_supported ?
+ VK_IMAGE_COMPRESSION_DEFAULT_EXT :
+ VK_IMAGE_COMPRESSION_DISABLED_EXT;
+ }
+
return VK_SUCCESS;
- fail:
- if (result == VK_ERROR_FORMAT_NOT_SUPPORTED) {
- /* From the Vulkan 1.0.42 spec:
- *
- * If the combination of parameters to
- * vkGetPhysicalDeviceImageFormatProperties2 is not supported by
- * the implementation for use in vkCreateImage, then all members of
- * imageFormatProperties will be filled with zero.
- */
- base_props->imageFormatProperties = (VkImageFormatProperties) {};
- }
+unsupported:
+ /* From the Vulkan 1.0.42 spec:
+ *
+ * If the combination of parameters to
+ * vkGetPhysicalDeviceImageFormatProperties2 is not supported by the
+ * implementation for use in vkCreateImage, then all members of
+ * imageFormatProperties will be filled with zero.
+ */
+ props->imageFormatProperties = (VkImageFormatProperties) {
+ .maxExtent = { 0, 0, 0 },
+ .maxMipLevels = 0,
+ .maxArrayLayers = 0,
+ .sampleCounts = 0,
+ .maxResourceSize = 0,
+ };
- return result;
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
}
-void anv_GetPhysicalDeviceSparseImageFormatProperties(
+VkResult anv_GetPhysicalDeviceImageFormatProperties2(
VkPhysicalDevice physicalDevice,
- VkFormat format,
- VkImageType type,
- uint32_t samples,
- VkImageUsageFlags usage,
- VkImageTiling tiling,
- uint32_t* pNumProperties,
- VkSparseImageFormatProperties* pProperties)
+ const VkPhysicalDeviceImageFormatInfo2* pImageFormatInfo,
+ VkImageFormatProperties2* pImageFormatProperties)
{
- /* Sparse images are not yet supported. */
- *pNumProperties = 0;
+ ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+
+ return anv_get_image_format_properties(physical_device,
+ pImageFormatInfo,
+ pImageFormatProperties);
}
void anv_GetPhysicalDeviceSparseImageFormatProperties2(
@@ -1509,8 +1809,125 @@ void anv_GetPhysicalDeviceSparseImageFormatProperties2(
uint32_t* pPropertyCount,
VkSparseImageFormatProperties2* pProperties)
{
- /* Sparse images are not yet supported. */
- *pPropertyCount = 0;
+ ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
+ const struct intel_device_info *devinfo = &physical_device->info;
+ VkImageAspectFlags aspects = vk_format_aspects(pFormatInfo->format);
+ VK_OUTARRAY_MAKE_TYPED(VkSparseImageFormatProperties2, props,
+ pProperties, pPropertyCount);
+
+ if (physical_device->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) {
+ if (INTEL_DEBUG(DEBUG_SPARSE))
+ fprintf(stderr, "=== [%s:%d] [%s]\n", __FILE__, __LINE__, __func__);
+ return;
+ }
+
+ vk_foreach_struct_const(ext, pFormatInfo->pNext)
+ anv_debug_ignored_stype(ext->sType);
+
+ /* Check if the image is supported at all (regardless of being Sparse). */
+ const VkPhysicalDeviceImageFormatInfo2 img_info = {
+ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2,
+ .pNext = NULL,
+ .format = pFormatInfo->format,
+ .type = pFormatInfo->type,
+ .tiling = pFormatInfo->tiling,
+ .usage = pFormatInfo->usage,
+ .flags = VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT,
+ };
+ VkImageFormatProperties2 img_props = {};
+ if (anv_get_image_format_properties(physical_device,
+ &img_info, &img_props) != VK_SUCCESS)
+ return;
+
+ if (anv_sparse_image_check_support(physical_device,
+ VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT,
+ pFormatInfo->tiling,
+ pFormatInfo->samples,
+ pFormatInfo->type,
+ pFormatInfo->format) != VK_SUCCESS) {
+ return;
+ }
+
+ VkExtent3D ds_granularity = {};
+ VkSparseImageFormatProperties2 *ds_props_ptr = NULL;
+
+ u_foreach_bit(b, aspects) {
+ VkImageAspectFlagBits aspect = 1 << b;
+
+ const uint32_t plane =
+ anv_aspect_to_plane(vk_format_aspects(pFormatInfo->format), aspect);
+ struct anv_format_plane anv_format_plane =
+ anv_get_format_plane(devinfo, pFormatInfo->format, plane,
+ pFormatInfo->tiling);
+ enum isl_format isl_format = anv_format_plane.isl_format;
+ assert(isl_format != ISL_FORMAT_UNSUPPORTED);
+
+ VkImageCreateFlags vk_create_flags =
+ VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT;
+
+ isl_surf_usage_flags_t isl_usage =
+ anv_image_choose_isl_surf_usage(physical_device,
+ vk_create_flags, pFormatInfo->usage,
+ 0, aspect,
+ VK_IMAGE_COMPRESSION_DEFAULT_EXT);
+
+ const enum isl_surf_dim isl_surf_dim =
+ pFormatInfo->type == VK_IMAGE_TYPE_1D ? ISL_SURF_DIM_1D :
+ pFormatInfo->type == VK_IMAGE_TYPE_2D ? ISL_SURF_DIM_2D :
+ ISL_SURF_DIM_3D;
+
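+      /* A 1x1 single-level surface is enough here: the sparse granularity
+       * computed below depends on the format, image type and sample count,
+       * not on the actual extent of the image.
+       */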
+ struct isl_surf isl_surf;
+ bool ok = isl_surf_init(&physical_device->isl_dev, &isl_surf,
+ .dim = isl_surf_dim,
+ .format = isl_format,
+ .width = 1,
+ .height = 1,
+ .depth = 1,
+ .levels = 1,
+ .array_len = 1,
+ .samples = pFormatInfo->samples,
+ .min_alignment_B = 0,
+ .row_pitch_B = 0,
+ .usage = isl_usage,
+ .tiling_flags = ISL_TILING_ANY_MASK);
+ if (!ok) {
+ /* There's no way to return an error code! */
+ assert(false);
+ *pPropertyCount = 0;
+ return;
+ }
+
+ VkSparseImageFormatProperties format_props =
+ anv_sparse_calc_image_format_properties(physical_device, aspect,
+ pFormatInfo->type,
+ &isl_surf);
+
+      /* If the depth and stencil aspects have the same granularity, merge
+       * them into a single entry.
+       */
+ if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ if (!ds_props_ptr) {
+ ds_granularity = format_props.imageGranularity;
+ } else if (ds_granularity.width ==
+ format_props.imageGranularity.width &&
+ ds_granularity.height ==
+ format_props.imageGranularity.height &&
+ ds_granularity.depth ==
+ format_props.imageGranularity.depth) {
+ ds_props_ptr->properties.aspectMask |= aspect;
+ continue;
+ }
+ }
+
+ vk_outarray_append_typed(VkSparseImageFormatProperties2, &props, p) {
+ p->properties = format_props;
+ if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT))
+ ds_props_ptr = p;
+ }
+ }
}
void anv_GetPhysicalDeviceExternalBufferProperties(
@@ -1563,82 +1980,3 @@ void anv_GetPhysicalDeviceExternalBufferProperties(
.compatibleHandleTypes = pExternalBufferInfo->handleType,
};
}
-
-VkResult anv_CreateSamplerYcbcrConversion(
- VkDevice _device,
- const VkSamplerYcbcrConversionCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSamplerYcbcrConversion* pYcbcrConversion)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_ycbcr_conversion *conversion;
-
- /* Search for VkExternalFormatANDROID and resolve the format. */
- struct anv_format *ext_format = NULL;
- const VkExternalFormatANDROID *ext_info =
- vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_FORMAT_ANDROID);
-
- uint64_t format = ext_info ? ext_info->externalFormat : 0;
- if (format) {
- assert(pCreateInfo->format == VK_FORMAT_UNDEFINED);
- ext_format = (struct anv_format *) (uintptr_t) format;
- }
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO);
-
- conversion = vk_object_zalloc(&device->vk, pAllocator, sizeof(*conversion),
- VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION);
- if (!conversion)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- conversion->format = anv_get_format(pCreateInfo->format);
- conversion->ycbcr_model = pCreateInfo->ycbcrModel;
- conversion->ycbcr_range = pCreateInfo->ycbcrRange;
-
- /* The Vulkan 1.1.95 spec says "When creating an external format conversion,
- * the value of components if ignored."
- */
- if (!ext_format) {
- conversion->mapping[0] = pCreateInfo->components.r;
- conversion->mapping[1] = pCreateInfo->components.g;
- conversion->mapping[2] = pCreateInfo->components.b;
- conversion->mapping[3] = pCreateInfo->components.a;
- }
-
- conversion->chroma_offsets[0] = pCreateInfo->xChromaOffset;
- conversion->chroma_offsets[1] = pCreateInfo->yChromaOffset;
- conversion->chroma_filter = pCreateInfo->chromaFilter;
-
- /* Setup external format. */
- if (ext_format)
- conversion->format = ext_format;
-
- bool has_chroma_subsampled = false;
- for (uint32_t p = 0; p < conversion->format->n_planes; p++) {
- if (conversion->format->planes[p].has_chroma &&
- (conversion->format->planes[p].denominator_scales[0] > 1 ||
- conversion->format->planes[p].denominator_scales[1] > 1))
- has_chroma_subsampled = true;
- }
- conversion->chroma_reconstruction = has_chroma_subsampled &&
- (conversion->chroma_offsets[0] == VK_CHROMA_LOCATION_COSITED_EVEN ||
- conversion->chroma_offsets[1] == VK_CHROMA_LOCATION_COSITED_EVEN);
-
- *pYcbcrConversion = anv_ycbcr_conversion_to_handle(conversion);
-
- return VK_SUCCESS;
-}
-
-void anv_DestroySamplerYcbcrConversion(
- VkDevice _device,
- VkSamplerYcbcrConversion YcbcrConversion,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion, YcbcrConversion);
-
- if (!conversion)
- return;
-
- vk_object_free(&device->vk, pAllocator, conversion);
-}
diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
index dd4c860a565..e721885cb55 100644
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -30,200 +30,9 @@
#include <fcntl.h>
#include "anv_private.h"
-#include "common/intel_defines.h"
#include "common/intel_gem.h"
-#include "drm-uapi/sync_file.h"
-/**
- * Wrapper around DRM_IOCTL_I915_GEM_CREATE.
- *
- * Return gem handle, or 0 on failure. Gem handles are never 0.
- */
-uint32_t
-anv_gem_create(struct anv_device *device, uint64_t size)
-{
- struct drm_i915_gem_create gem_create = {
- .size = size,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create);
- if (ret != 0) {
- /* FIXME: What do we do if this fails? */
- return 0;
- }
-
- return gem_create.handle;
-}
-
-void
-anv_gem_close(struct anv_device *device, uint32_t gem_handle)
-{
- struct drm_gem_close close = {
- .handle = gem_handle,
- };
-
- intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close);
-}
-
-uint32_t
-anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size,
- uint32_t num_regions,
- struct drm_i915_gem_memory_class_instance *regions)
-{
- struct drm_i915_gem_create_ext_memory_regions ext_regions = {
- .base = { .name = I915_GEM_CREATE_EXT_MEMORY_REGIONS },
- .num_regions = num_regions,
- .regions = (uintptr_t)regions,
- };
-
- struct drm_i915_gem_create_ext gem_create = {
- .size = anv_bo_size,
- .extensions = (uintptr_t) &ext_regions,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE_EXT,
- &gem_create);
- if (ret != 0) {
- return 0;
- }
-
- return gem_create.handle;
-}
-
-/**
- * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error.
- */
-static void*
-anv_gem_mmap_offset(struct anv_device *device, uint32_t gem_handle,
- uint64_t offset, uint64_t size, uint32_t flags)
-{
- struct drm_i915_gem_mmap_offset gem_mmap = {
- .handle = gem_handle,
- .flags = device->info.has_local_mem ? I915_MMAP_OFFSET_FIXED :
- (flags & I915_MMAP_WC) ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB,
- };
- assert(offset == 0);
-
- /* Get the fake offset back */
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &gem_mmap);
- if (ret != 0)
- return MAP_FAILED;
-
- /* And map it */
- void *map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
- device->fd, gem_mmap.offset);
- return map;
-}
-
-static void*
-anv_gem_mmap_legacy(struct anv_device *device, uint32_t gem_handle,
- uint64_t offset, uint64_t size, uint32_t flags)
-{
- assert(!device->info.has_local_mem);
-
- struct drm_i915_gem_mmap gem_mmap = {
- .handle = gem_handle,
- .offset = offset,
- .size = size,
- .flags = flags,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap);
- if (ret != 0)
- return MAP_FAILED;
-
- return (void *)(uintptr_t) gem_mmap.addr_ptr;
-}
-
-/**
- * Wrapper around DRM_IOCTL_I915_GEM_MMAP. Returns MAP_FAILED on error.
- */
-void*
-anv_gem_mmap(struct anv_device *device, uint32_t gem_handle,
- uint64_t offset, uint64_t size, uint32_t flags)
-{
- void *map;
- if (device->physical->has_mmap_offset)
- map = anv_gem_mmap_offset(device, gem_handle, offset, size, flags);
- else
- map = anv_gem_mmap_legacy(device, gem_handle, offset, size, flags);
-
- if (map != MAP_FAILED)
- VG(VALGRIND_MALLOCLIKE_BLOCK(map, size, 0, 1));
-
- return map;
-}
-
-/* This is just a wrapper around munmap, but it also notifies valgrind that
- * this map is no longer valid. Pair this with anv_gem_mmap().
- */
-void
-anv_gem_munmap(struct anv_device *device, void *p, uint64_t size)
-{
- VG(VALGRIND_FREELIKE_BLOCK(p, 0));
- munmap(p, size);
-}
-
-uint32_t
-anv_gem_userptr(struct anv_device *device, void *mem, size_t size)
-{
- struct drm_i915_gem_userptr userptr = {
- .user_ptr = (__u64)((unsigned long) mem),
- .user_size = size,
- .flags = 0,
- };
-
- if (device->physical->has_userptr_probe)
- userptr.flags |= I915_USERPTR_PROBE;
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_USERPTR, &userptr);
- if (ret == -1)
- return 0;
-
- return userptr.handle;
-}
-
-int
-anv_gem_set_caching(struct anv_device *device,
- uint32_t gem_handle, uint32_t caching)
-{
- struct drm_i915_gem_caching gem_caching = {
- .handle = gem_handle,
- .caching = caching,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &gem_caching);
-}
-
-int
-anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
- uint32_t read_domains, uint32_t write_domain)
-{
- struct drm_i915_gem_set_domain gem_set_domain = {
- .handle = gem_handle,
- .read_domains = read_domains,
- .write_domain = write_domain,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &gem_set_domain);
-}
-
-/**
- * Returns 0, 1, or negative to indicate error
- */
-int
-anv_gem_busy(struct anv_device *device, uint32_t gem_handle)
-{
- struct drm_i915_gem_busy busy = {
- .handle = gem_handle,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
- if (ret < 0)
- return ret;
-
- return busy.busy != 0;
-}
+#include "i915/anv_gem.h"
/**
* On error, \a timeout_ns holds the remaining time.
@@ -231,319 +40,45 @@ anv_gem_busy(struct anv_device *device, uint32_t gem_handle)
int
anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns)
{
- struct drm_i915_gem_wait wait = {
- .bo_handle = gem_handle,
- .timeout_ns = *timeout_ns,
- .flags = 0,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
- *timeout_ns = wait.timeout_ns;
-
- return ret;
-}
-
-int
-anv_gem_execbuffer(struct anv_device *device,
- struct drm_i915_gem_execbuffer2 *execbuf)
-{
- if (execbuf->flags & I915_EXEC_FENCE_OUT)
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, execbuf);
- else
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, execbuf);
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_gem_wait(device, gem_handle, timeout_ns);
+ case INTEL_KMD_TYPE_XE:
+ return -1;
+ default:
+ unreachable("missing");
+ return -1;
+ }
}
/** Return -1 on error. */
int
anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle)
{
- struct drm_i915_gem_get_tiling get_tiling = {
- .handle = gem_handle,
- };
-
- /* FIXME: On discrete platforms we don't have DRM_IOCTL_I915_GEM_GET_TILING
- * anymore, so we will need another way to get the tiling. Apparently this
- * is only used in Android code, so we may need some other way to
- * communicate the tiling mode.
- */
- if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
- assert(!"Failed to get BO tiling");
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_gem_get_tiling(device, gem_handle);
+ case INTEL_KMD_TYPE_XE:
+ return -1;
+ default:
+ unreachable("missing");
return -1;
}
-
- return get_tiling.tiling_mode;
}
int
anv_gem_set_tiling(struct anv_device *device,
uint32_t gem_handle, uint32_t stride, uint32_t tiling)
{
- int ret;
-
- /* On discrete platforms we don't have DRM_IOCTL_I915_GEM_SET_TILING. So
- * nothing needs to be done.
- */
- if (!device->info.has_tiling_uapi)
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_gem_set_tiling(device, gem_handle, stride, tiling);
+ case INTEL_KMD_TYPE_XE:
return 0;
-
- /* set_tiling overwrites the input on the error path, so we have to open
- * code intel_ioctl.
- */
- do {
- struct drm_i915_gem_set_tiling set_tiling = {
- .handle = gem_handle,
- .tiling_mode = tiling,
- .stride = stride,
- };
-
- ret = ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
- } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
-
- return ret;
-}
-
-int
-anv_gem_get_param(int fd, uint32_t param)
-{
- int tmp;
-
- drm_i915_getparam_t gp = {
- .param = param,
- .value = &tmp,
- };
-
- int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
- if (ret == 0)
- return tmp;
-
- return 0;
-}
-
-uint64_t
-anv_gem_get_drm_cap(int fd, uint32_t capability)
-{
- struct drm_get_cap cap = {
- .capability = capability,
- };
-
- intel_ioctl(fd, DRM_IOCTL_GET_CAP, &cap);
- return cap.value;
-}
-
-bool
-anv_gem_get_bit6_swizzle(int fd, uint32_t tiling)
-{
- struct drm_gem_close close;
- int ret;
-
- struct drm_i915_gem_create gem_create = {
- .size = 4096,
- };
-
- if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
- assert(!"Failed to create GEM BO");
- return false;
- }
-
- bool swizzled = false;
-
- /* set_tiling overwrites the input on the error path, so we have to open
- * code intel_ioctl.
- */
- do {
- struct drm_i915_gem_set_tiling set_tiling = {
- .handle = gem_create.handle,
- .tiling_mode = tiling,
- .stride = tiling == I915_TILING_X ? 512 : 128,
- };
-
- ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
- } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
-
- if (ret != 0) {
- assert(!"Failed to set BO tiling");
- goto close_and_return;
- }
-
- struct drm_i915_gem_get_tiling get_tiling = {
- .handle = gem_create.handle,
- };
-
- if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
- assert(!"Failed to get BO tiling");
- goto close_and_return;
- }
-
- swizzled = get_tiling.swizzle_mode != I915_BIT_6_SWIZZLE_NONE;
-
-close_and_return:
-
- memset(&close, 0, sizeof(close));
- close.handle = gem_create.handle;
- intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
-
- return swizzled;
-}
-
-bool
-anv_gem_has_context_priority(int fd)
-{
- return !anv_gem_set_context_param(fd, 0, I915_CONTEXT_PARAM_PRIORITY,
- INTEL_CONTEXT_MEDIUM_PRIORITY);
-}
-
-int
-anv_gem_create_context(struct anv_device *device)
-{
- struct drm_i915_gem_context_create create = { 0 };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create);
- if (ret == -1)
- return -1;
-
- return create.ctx_id;
-}
-
-int
-anv_gem_create_context_engines(struct anv_device *device,
- const struct drm_i915_query_engine_info *info,
- int num_engines, uint16_t *engine_classes)
-{
- const size_t engine_inst_sz = 2 * sizeof(__u16); /* 1 class, 1 instance */
- const size_t engines_param_size =
- sizeof(__u64) /* extensions */ + num_engines * engine_inst_sz;
-
- void *engines_param = malloc(engines_param_size);
- assert(engines_param);
- *(__u64*)engines_param = 0;
- __u16 *class_inst_ptr = (__u16*)(((__u64*)engines_param) + 1);
-
- /* For each type of drm_i915_gem_engine_class of interest, we keep track of
- * the previous engine instance used.
- */
- int last_engine_idx[] = {
- [I915_ENGINE_CLASS_RENDER] = -1,
- };
-
- int i915_engine_counts[] = {
- [I915_ENGINE_CLASS_RENDER] =
- anv_gem_count_engines(info, I915_ENGINE_CLASS_RENDER),
- };
-
- /* For each queue, we look for the next instance that matches the class we
- * need.
- */
- for (int i = 0; i < num_engines; i++) {
- uint16_t engine_class = engine_classes[i];
- if (i915_engine_counts[engine_class] <= 0) {
- free(engines_param);
- return -1;
- }
-
- /* Run through the engines reported by the kernel looking for the next
- * matching instance. We loop in case we want to create multiple
- * contexts on an engine instance.
- */
- int engine_instance = -1;
- for (int i = 0; i < info->num_engines; i++) {
- int *idx = &last_engine_idx[engine_class];
- if (++(*idx) >= info->num_engines)
- *idx = 0;
- if (info->engines[*idx].engine.engine_class == engine_class) {
- engine_instance = info->engines[*idx].engine.engine_instance;
- break;
- }
- }
- if (engine_instance < 0) {
- free(engines_param);
- return -1;
- }
-
- *class_inst_ptr++ = engine_class;
- *class_inst_ptr++ = engine_instance;
- }
-
- assert((uintptr_t)engines_param + engines_param_size ==
- (uintptr_t)class_inst_ptr);
-
- struct drm_i915_gem_context_create_ext_setparam set_engines = {
- .base = {
- .name = I915_CONTEXT_CREATE_EXT_SETPARAM,
- },
- .param = {
- .param = I915_CONTEXT_PARAM_ENGINES,
- .value = (uintptr_t)engines_param,
- .size = engines_param_size,
- }
- };
- struct drm_i915_gem_context_create_ext create = {
- .flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS,
- .extensions = (uintptr_t)&set_engines,
- };
- int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &create);
- free(engines_param);
- if (ret == -1)
- return -1;
-
- return create.ctx_id;
-}
-
-int
-anv_gem_destroy_context(struct anv_device *device, int context)
-{
- struct drm_i915_gem_context_destroy destroy = {
- .ctx_id = context,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy);
-}
-
-int
-anv_gem_set_context_param(int fd, int context, uint32_t param, uint64_t value)
-{
- struct drm_i915_gem_context_param p = {
- .ctx_id = context,
- .param = param,
- .value = value,
- };
- int err = 0;
-
- if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p))
- err = -errno;
- return err;
-}
-
-int
-anv_gem_get_context_param(int fd, int context, uint32_t param, uint64_t *value)
-{
- struct drm_i915_gem_context_param gp = {
- .ctx_id = context,
- .param = param,
- };
-
- int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp);
- if (ret == -1)
+ default:
+ unreachable("missing");
return -1;
-
- *value = gp.value;
- return 0;
-}
-
-int
-anv_gem_context_get_reset_stats(int fd, int context,
- uint32_t *active, uint32_t *pending)
-{
- struct drm_i915_reset_stats stats = {
- .ctx_id = context,
- };
-
- int ret = intel_ioctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats);
- if (ret == 0) {
- *active = stats.batch_active;
- *pending = stats.batch_pending;
}
-
- return ret;
}
int
@@ -575,220 +110,27 @@ anv_gem_fd_to_handle(struct anv_device *device, int fd)
return args.handle;
}
-int
-anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result)
-{
- struct drm_i915_reg_read args = {
- .offset = offset
- };
-
- int ret = intel_ioctl(fd, DRM_IOCTL_I915_REG_READ, &args);
-
- *result = args.val;
- return ret;
-}
-
-int
-anv_gem_sync_file_merge(struct anv_device *device, int fd1, int fd2)
-{
- struct sync_merge_data args = {
- .name = "anv merge fence",
- .fd2 = fd2,
- .fence = -1,
- };
-
- int ret = intel_ioctl(fd1, SYNC_IOC_MERGE, &args);
- if (ret == -1)
- return -1;
-
- return args.fence;
-}
-
-uint32_t
-anv_gem_syncobj_create(struct anv_device *device, uint32_t flags)
-{
- struct drm_syncobj_create args = {
- .flags = flags,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &args);
- if (ret)
- return 0;
-
- return args.handle;
-}
-
-void
-anv_gem_syncobj_destroy(struct anv_device *device, uint32_t handle)
-{
- struct drm_syncobj_destroy args = {
- .handle = handle,
- };
-
- intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &args);
-}
-
-int
-anv_gem_syncobj_handle_to_fd(struct anv_device *device, uint32_t handle)
-{
- struct drm_syncobj_handle args = {
- .handle = handle,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args);
- if (ret)
- return -1;
-
- return args.fd;
-}
-
-uint32_t
-anv_gem_syncobj_fd_to_handle(struct anv_device *device, int fd)
-{
- struct drm_syncobj_handle args = {
- .fd = fd,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args);
- if (ret)
- return 0;
-
- return args.handle;
-}
-
-int
-anv_gem_syncobj_export_sync_file(struct anv_device *device, uint32_t handle)
-{
- struct drm_syncobj_handle args = {
- .handle = handle,
- .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE,
- };
-
- int ret = intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &args);
- if (ret)
- return -1;
-
- return args.fd;
-}
-
-int
-anv_gem_syncobj_import_sync_file(struct anv_device *device,
- uint32_t handle, int fd)
-{
- struct drm_syncobj_handle args = {
- .handle = handle,
- .fd = fd,
- .flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args);
-}
-
-void
-anv_gem_syncobj_reset(struct anv_device *device, uint32_t handle)
-{
- struct drm_syncobj_array args = {
- .handles = (uint64_t)(uintptr_t)&handle,
- .count_handles = 1,
- };
-
- intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_RESET, &args);
-}
-
-bool
-anv_gem_supports_syncobj_wait(int fd)
-{
- return intel_gem_supports_syncobj_wait(fd);
-}
-
-int
-anv_gem_syncobj_wait(struct anv_device *device,
- const uint32_t *handles, uint32_t num_handles,
- int64_t abs_timeout_ns, bool wait_all)
-{
- struct drm_syncobj_wait args = {
- .handles = (uint64_t)(uintptr_t)handles,
- .count_handles = num_handles,
- .timeout_nsec = abs_timeout_ns,
- .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
- };
-
- if (wait_all)
- args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args);
-}
-
-int
-anv_gem_syncobj_timeline_wait(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items, int64_t abs_timeout_ns,
- bool wait_all, bool wait_materialize)
-{
- assert(device->physical->has_syncobj_wait_available);
-
- struct drm_syncobj_timeline_wait args = {
- .handles = (uint64_t)(uintptr_t)handles,
- .points = (uint64_t)(uintptr_t)points,
- .count_handles = num_items,
- .timeout_nsec = abs_timeout_ns,
- .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
- };
-
- if (wait_all)
- args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;
- if (wait_materialize)
- args.flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE;
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &args);
-}
-
-int
-anv_gem_syncobj_timeline_signal(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items)
-{
- assert(device->physical->has_syncobj_wait_available);
-
- struct drm_syncobj_timeline_array args = {
- .handles = (uint64_t)(uintptr_t)handles,
- .points = (uint64_t)(uintptr_t)points,
- .count_handles = num_items,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, &args);
-}
-
-int
-anv_gem_syncobj_timeline_query(struct anv_device *device,
- const uint32_t *handles, uint64_t *points,
- uint32_t num_items)
-{
- assert(device->physical->has_syncobj_wait_available);
-
- struct drm_syncobj_timeline_array args = {
- .handles = (uint64_t)(uintptr_t)handles,
- .points = (uint64_t)(uintptr_t)points,
- .count_handles = num_items,
- };
-
- return intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_QUERY, &args);
-}
-
-struct drm_i915_query_engine_info *
-anv_gem_get_engine_info(int fd)
-{
- return intel_i915_query_alloc(fd, DRM_I915_QUERY_ENGINE_INFO);
+VkResult
+anv_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *bo_flags)
+{
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_gem_import_bo_alloc_flags_to_bo_flags(device, bo,
+ alloc_flags,
+ bo_flags);
+ case INTEL_KMD_TYPE_XE:
+ *bo_flags = device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
+ return VK_SUCCESS;
+ default:
+ unreachable("missing");
+ return VK_ERROR_UNKNOWN;
+ }
}
-int
-anv_gem_count_engines(const struct drm_i915_query_engine_info *info,
- uint16_t engine_class)
+const struct anv_kmd_backend *anv_stub_kmd_backend_get(void)
{
- int count = 0;
- for (int i = 0; i < info->num_engines; i++) {
- if (info->engines[i].engine.engine_class == engine_class)
- count++;
- }
- return count;
+ return NULL;
}
diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c
index c552b7c6dc2..48795b431a8 100644
--- a/src/intel/vulkan/anv_gem_stubs.c
+++ b/src/intel/vulkan/anv_gem_stubs.c
@@ -27,8 +27,18 @@
#include "util/anon_file.h"
#include "anv_private.h"
-uint32_t
-anv_gem_create(struct anv_device *device, uint64_t size)
+static void
+stub_gem_close(struct anv_device *device, struct anv_bo *bo)
+{
+ close(bo->gem_handle);
+}
+
+static uint32_t
+stub_gem_create(struct anv_device *device,
+ const struct intel_memory_class_instance **regions,
+ uint16_t num_regions, uint64_t size,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t *actual_size)
{
int fd = os_create_anonymous_file(size, "fake bo");
if (fd == -1)
@@ -36,45 +46,62 @@ anv_gem_create(struct anv_device *device, uint64_t size)
assert(fd != 0);
+ *actual_size = size;
return fd;
}
-void
-anv_gem_close(struct anv_device *device, uint32_t gem_handle)
+static void *
+stub_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
+ uint64_t size, void *placed_addr)
{
- close(gem_handle);
+ return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, bo->gem_handle,
+ offset);
}
-uint32_t
-anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size,
- uint32_t num_regions,
- struct drm_i915_gem_memory_class_instance *regions)
+static VkResult
+stub_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
+ uint32_t batch_bo_size, bool is_companion_rcs_batch)
{
- return 0;
+ return VK_ERROR_UNKNOWN;
}
-void*
-anv_gem_mmap(struct anv_device *device, uint32_t gem_handle,
- uint64_t offset, uint64_t size, uint32_t flags)
+static VkResult
+stub_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo)
{
- /* Ignore flags, as they're specific to I915_GEM_MMAP. */
- (void) flags;
+ return VK_ERROR_UNKNOWN;
+}
- return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
- gem_handle, offset);
+static VkResult
+stub_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit)
+{
+ return VK_ERROR_UNKNOWN;
}
-/* This is just a wrapper around munmap, but it also notifies valgrind that
- * this map is no longer valid. Pair this with anv_gem_mmap().
- */
-void
-anv_gem_munmap(struct anv_device *device, void *p, uint64_t size)
+static VkResult
+stub_queue_exec_trace(struct anv_queue *queue, struct anv_utrace_submit *submit)
{
- munmap(p, size);
+ return VK_ERROR_UNKNOWN;
}
-uint32_t
-anv_gem_userptr(struct anv_device *device, void *mem, size_t size)
+static uint32_t
+stub_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
+{
+ return 0;
+}
+
+static uint32_t
+stub_gem_create_userptr(struct anv_device *device, void *mem, uint64_t size)
{
int fd = os_create_anonymous_file(size, "fake bo");
if (fd == -1)
@@ -86,25 +113,12 @@ anv_gem_userptr(struct anv_device *device, void *mem, size_t size)
}
int
-anv_gem_busy(struct anv_device *device, uint32_t gem_handle)
-{
- return 0;
-}
-
-int
anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns)
{
return 0;
}
int
-anv_gem_execbuffer(struct anv_device *device,
- struct drm_i915_gem_execbuffer2 *execbuf)
-{
- return 0;
-}
-
-int
anv_gem_set_tiling(struct anv_device *device,
uint32_t gem_handle, uint32_t stride, uint32_t tiling)
{
@@ -118,75 +132,6 @@ anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle)
}
int
-anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle,
- uint32_t caching)
-{
- return 0;
-}
-
-int
-anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
- uint32_t read_domains, uint32_t write_domain)
-{
- return 0;
-}
-
-int
-anv_gem_get_param(int fd, uint32_t param)
-{
- unreachable("Unused");
-}
-
-uint64_t
-anv_gem_get_drm_cap(int fd, uint32_t capability)
-{
- return 0;
-}
-
-bool
-anv_gem_get_bit6_swizzle(int fd, uint32_t tiling)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_create_context(struct anv_device *device)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_destroy_context(struct anv_device *device, int context)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_set_context_param(int fd, int context, uint32_t param, uint64_t value)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_get_context_param(int fd, int context, uint32_t param, uint64_t *value)
-{
- unreachable("Unused");
-}
-
-bool
-anv_gem_has_context_priority(int fd)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_context_get_reset_stats(int fd, int context,
- uint32_t *active, uint32_t *pending)
-{
- unreachable("Unused");
-}
-
-int
anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle)
{
unreachable("Unused");
@@ -198,125 +143,43 @@ anv_gem_fd_to_handle(struct anv_device *device, int fd)
unreachable("Unused");
}
-int
-anv_gem_sync_file_merge(struct anv_device *device, int fd1, int fd2)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_export_sync_file(struct anv_device *device, uint32_t handle)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_import_sync_file(struct anv_device *device,
- uint32_t handle, int fd)
-{
- unreachable("Unused");
-}
-
-uint32_t
-anv_gem_syncobj_create(struct anv_device *device, uint32_t flags)
-{
- unreachable("Unused");
-}
-
-void
-anv_gem_syncobj_destroy(struct anv_device *device, uint32_t handle)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_handle_to_fd(struct anv_device *device, uint32_t handle)
-{
- unreachable("Unused");
-}
-
-uint32_t
-anv_gem_syncobj_fd_to_handle(struct anv_device *device, int fd)
-{
- unreachable("Unused");
-}
-
-void
-anv_gem_syncobj_reset(struct anv_device *device, uint32_t handle)
-{
- unreachable("Unused");
-}
-
-bool
-anv_gem_supports_syncobj_wait(int fd)
+VkResult
+anv_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *bo_flags)
{
- return false;
+ return VK_SUCCESS;
}
-int
-anv_i915_query(int fd, uint64_t query_id, void *buffer,
- int32_t *buffer_len)
+static VkResult
+stub_vm_bind(struct anv_device *device, struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags)
{
- unreachable("Unused");
+ return VK_SUCCESS;
}
-int
-anv_gem_create_context_engines(struct anv_device *device,
- const struct drm_i915_query_engine_info *info,
- int num_engines,
- uint16_t *engine_classes)
+static VkResult
+stub_vm_bind_bo(struct anv_device *device, struct anv_bo *bo)
{
- unreachable("Unused");
+ return VK_SUCCESS;
}
-struct drm_i915_query_engine_info *
-anv_gem_get_engine_info(int fd)
+const struct anv_kmd_backend *anv_stub_kmd_backend_get(void)
{
- unreachable("Unused");
-}
-
-int
-anv_gem_count_engines(const struct drm_i915_query_engine_info *info,
- uint16_t engine_class)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_wait(struct anv_device *device,
- const uint32_t *handles, uint32_t num_handles,
- int64_t abs_timeout_ns, bool wait_all)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_timeline_wait(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items, int64_t abs_timeout_ns,
- bool wait_all, bool wait_materialize)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_timeline_signal(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items)
-{
- unreachable("Unused");
-}
-
-int
-anv_gem_syncobj_timeline_query(struct anv_device *device,
- const uint32_t *handles, uint64_t *points,
- uint32_t num_items)
-{
- unreachable("Unused");
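+   /* Stub backend: every callback is a no-op or returns an error, presumably
+    * for builds and tools that run without a real DRM device.
+    */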
+ static const struct anv_kmd_backend stub_backend = {
+ .gem_create = stub_gem_create,
+ .gem_create_userptr = stub_gem_create_userptr,
+ .gem_close = stub_gem_close,
+ .gem_mmap = stub_gem_mmap,
+ .vm_bind = stub_vm_bind,
+ .vm_bind_bo = stub_vm_bind_bo,
+ .vm_unbind_bo = stub_vm_bind_bo,
+ .execute_simple_batch = stub_execute_simple_batch,
+ .execute_trtt_batch = stub_execute_trtt_batch,
+ .queue_exec_locked = stub_queue_exec_locked,
+ .queue_exec_trace = stub_queue_exec_trace,
+ .bo_alloc_flags_to_bo_flags = stub_bo_alloc_flags_to_bo_flags,
+ };
+ return &stub_backend;
}
diff --git a/src/intel/vulkan/anv_genX.h b/src/intel/vulkan/anv_genX.h
index 025ceff8a95..9370a9dc7a2 100644
--- a/src/intel/vulkan/anv_genX.h
+++ b/src/intel/vulkan/anv_genX.h
@@ -36,6 +36,14 @@
#error This file is included by means other than anv_private.h
#endif
+struct intel_sample_positions;
+struct intel_urb_config;
+struct anv_embedded_sampler;
+struct anv_pipeline_embedded_sampler_binding;
+
+typedef struct nir_builder nir_builder;
+typedef struct nir_shader nir_shader;
+
extern const uint32_t genX(vk_to_intel_cullmode)[];
extern const uint32_t genX(vk_to_intel_front_face)[];
@@ -48,16 +56,36 @@ extern const uint32_t genX(vk_to_intel_stencil_op)[];
extern const uint32_t genX(vk_to_intel_logic_op)[];
+extern const uint32_t genX(vk_to_intel_fillmode)[];
+
void genX(init_physical_device_state)(struct anv_physical_device *device);
VkResult genX(init_device_state)(struct anv_device *device);
+void genX(init_cps_device_state)(struct anv_device *device);
+
+nir_shader *genX(load_libanv_shader)(struct anv_device *device, void *mem_ctx);
+
+uint32_t genX(call_internal_shader)(nir_builder *b,
+ enum anv_internal_kernel_name shader_name);
+
+void
+genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ const enum isl_format format,
+ union isl_color_value clear_color);
+
+void
+genX(load_image_clear_color)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state surface_state,
+ const struct anv_image *image);
+
+void genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer);
+
void genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer);
-void genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer);
-
void genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
const struct isl_surf *surf);
@@ -73,8 +101,46 @@ void genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
unsigned width, unsigned height,
unsigned scale);
+void genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
+ const struct intel_urb_config *urb_cfg);
+
void genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer);
void genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer);
+void genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
+ const struct anv_device *device);
+
+void genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer);
+
+void genX(emit_vertex_input)(struct anv_batch *batch,
+ uint32_t *vertex_element_dws,
+ struct anv_graphics_pipeline *pipeline,
+ const struct vk_vertex_input_state *vi,
+ bool emit_in_pipeline);
+
+enum anv_pipe_bits
+genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
+ struct anv_device *device,
+ uint32_t current_pipeline,
+ enum anv_pipe_bits bits,
+ enum anv_pipe_bits *emitted_flush_bits);
+void
+genX(invalidate_aux_map)(struct anv_batch *batch,
+ struct anv_device *device,
+ enum intel_engine_class engine_class,
+ enum anv_pipe_bits bits);
+
+
+void genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
+ struct anv_device *device,
+ struct anv_batch *batch);
+
+void genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state);
+
+void genX(emit_so_memcpy_end)(struct anv_memcpy_state *state);
+
+void genX(emit_so_memcpy)(struct anv_memcpy_state *state,
+ struct anv_address dst, struct anv_address src,
+ uint32_t size);
void genX(emit_l3_config)(struct anv_batch *batch,
const struct anv_device *device,
@@ -83,10 +149,21 @@ void genX(emit_l3_config)(struct anv_batch *batch,
void genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
const struct intel_l3_config *cfg);
-void genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer);
-void genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(flush_descriptor_buffers)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state);
+
+uint32_t
+genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ const VkShaderStageFlags dirty,
+ struct anv_shader_bin **shaders,
+ uint32_t num_shaders);
+
+void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
+
+void genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer);
-void genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer);
+void genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer);
void genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
bool enable);
@@ -101,46 +178,216 @@ void genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
void genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer);
+struct anv_address genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer);
+
+void genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t total_scratch);
+
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
const struct intel_l3_config *l3_config,
VkShaderStageFlags active_stages,
- const unsigned entry_size[4],
+ const struct intel_urb_config *urb_cfg_in,
+ struct intel_urb_config *urb_cfg_out,
enum intel_urb_deref_block_size *deref_block_size);
-void genX(emit_multisample)(struct anv_batch *batch, uint32_t samples,
- const VkSampleLocationEXT *locations);
-
-void genX(emit_sample_pattern)(struct anv_batch *batch, uint32_t samples,
- const VkSampleLocationEXT *locations);
-
-void genX(emit_shading_rate)(struct anv_batch *batch,
- const struct anv_graphics_pipeline *pipeline,
- struct anv_state cps_states,
- struct anv_dynamic_state *dynamic_state);
+void genX(emit_sample_pattern)(struct anv_batch *batch,
+ const struct vk_sample_locations_state *sl);
void genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
struct anv_address dst, struct anv_address src,
uint32_t size);
+void genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_kernel *kernel,
+ const uint32_t *global_size, /* NULL for indirect */
+ uint32_t arg_count,
+ const struct anv_kernel_arg *args);
+
+void genX(blorp_init_dynamic_states)(struct blorp_context *context);
+
void genX(blorp_exec)(struct blorp_batch *batch,
const struct blorp_params *params);
+void genX(batch_emit_secondary_call)(struct anv_batch *batch,
+ struct anv_address secondary_addr,
+ struct anv_address secondary_return_addr);
+
+void *genX(batch_emit_return)(struct anv_batch *batch);
+
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
- struct anv_bo *bo,
- uint32_t offset);
+ struct anv_device *device,
+ struct anv_address addr,
+ enum anv_timestamp_capture_type type,
+ void *data);
void
-genX(rasterization_mode)(VkPolygonMode raster_mode,
- VkLineRasterizationModeEXT line_mode,
- float line_width,
- uint32_t *api_mode,
- bool *msaa_rasterization_enable);
+genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
+ const struct anv_device *device,
+ uint32_t primitive_topology,
+ uint32_t vertex_count);
-uint32_t
-genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
- VkPolygonMode raster_mode);
+void genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
+ struct anv_device *device);
VkPolygonMode
-genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
+genX(raster_polygon_mode)(const struct anv_graphics_pipeline *pipeline,
+ VkPolygonMode polygon_mode,
VkPrimitiveTopology primitive_topology);
+
+void
+genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state);
+
+void
+genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline);
+
+void
+genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline);
+
+#define anv_shader_bin_get_bsr(bin, local_arg_offset) ({ \
+ assert((local_arg_offset) % 8 == 0); \
+ const struct brw_bs_prog_data *prog_data = \
+ brw_bs_prog_data_const(bin->prog_data); \
+ assert(prog_data->simd_size == 8 || prog_data->simd_size == 16); \
+ \
+ (struct GENX(BINDLESS_SHADER_RECORD)) { \
+ .OffsetToLocalArguments = (local_arg_offset) / 8, \
+ .BindlessShaderDispatchMode = \
+ prog_data->simd_size == 16 ? RT_SIMD16 : RT_SIMD8, \
+ .KernelStartPointer = bin->kernel.offset, \
+ }; \
+})
+
+void
+genX(batch_set_preemption)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ bool value);
+
+void
+genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value);
+
+void
+genX(batch_emit_pipe_control)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ enum anv_pipe_bits bits,
+ const char *reason);
+
+void
+genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ uint32_t post_sync_op,
+ struct anv_address address,
+ uint32_t imm_data,
+ enum anv_pipe_bits bits,
+ const char *reason);
+
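+
+/* Convenience wrappers that pass the calling function's name as the 'reason'
+ * string used for pipe control debug annotations.
+ */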
+#define genx_batch_emit_pipe_control(a, b, c, d) \
+genX(batch_emit_pipe_control) (a, b, c, d, __func__)
+
+#define genx_batch_emit_pipe_control_write(a, b, c, d, e, f, g) \
+genX(batch_emit_pipe_control_write) (a, b, c, d, e, f, g, __func__)
+
+void genX(batch_emit_breakpoint)(struct anv_batch *batch,
+ struct anv_device *device,
+ bool emit_before_draw);
+
+static inline void
+genX(emit_breakpoint)(struct anv_batch *batch,
+ struct anv_device *device,
+ bool emit_before_draw)
+{
+ if (INTEL_DEBUG(DEBUG_DRAW_BKP))
+ genX(batch_emit_breakpoint)(batch, device, emit_before_draw);
+}
+
+void
+genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *buffer,
+ VkCommandBufferLevel level);
+
+struct anv_state
+genX(cmd_buffer_begin_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer);
+
+void
+genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state syncpoint);
+
+void
+genX(emit_simple_shader_init)(struct anv_simple_shader *state);
+
+void
+genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
+ uint32_t num_threads,
+ struct anv_state push_state);
+
+struct anv_state
+genX(simple_shader_alloc_push)(struct anv_simple_shader *state, uint32_t size);
+
+struct anv_address
+genX(simple_shader_push_state_address)(struct anv_simple_shader *state,
+ struct anv_state push_state);
+
+void
+genX(emit_simple_shader_end)(struct anv_simple_shader *state);
+
+VkResult genX(init_trtt_context_state)(struct anv_queue *queue);
+
+VkResult genX(write_trtt_entries)(struct anv_trtt_submission *submit);
+
+void
+genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_descriptor_set *set);
+
+void
+genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_descriptor_set *set);
+
+static inline VkShaderStageFlags
+genX(cmd_buffer_flush_push_descriptors)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *state,
+ struct anv_pipeline *pipeline)
+{
+ if (!pipeline->use_push_descriptor && !pipeline->use_push_descriptor_buffer)
+ return 0;
+
+ assert(pipeline->layout.push_descriptor_set_index != -1);
+ struct anv_descriptor_set *set =
+ state->descriptors[pipeline->layout.push_descriptor_set_index];
+ assert(set->is_push);
+
+ const VkShaderStageFlags push_buffer_dirty =
+ cmd_buffer->state.push_descriptors_dirty &
+ pipeline->use_push_descriptor_buffer;
+ if (push_buffer_dirty) {
+ if (set->desc_surface_state.map == NULL)
+ genX(cmd_buffer_emit_push_descriptor_buffer_surface)(cmd_buffer, set);
+
+ /* Force the next push descriptor update to allocate a new descriptor set. */
+ state->push_descriptor.set_used_on_gpu = true;
+ }
+
+ const VkShaderStageFlags push_descriptor_dirty =
+ cmd_buffer->state.push_descriptors_dirty & pipeline->use_push_descriptor;
+ if (push_descriptor_dirty) {
+ genX(cmd_buffer_emit_push_descriptor_surfaces)(cmd_buffer, set);
+
+ /* Force the next push descriptor update to allocate a new descriptor set. */
+ state->push_descriptor.set_used_on_gpu = true;
+ }
+
+ /* Clear the dirty stages now that we've generated the surface states for
+ * them.
+ */
+ cmd_buffer->state.push_descriptors_dirty &=
+ ~(push_descriptor_dirty | push_buffer_dirty);
+
+ /* Return the binding table stages that need to be updated */
+ return push_buffer_dirty | push_descriptor_dirty;
+}
+
+void genX(emit_embedded_sampler)(struct anv_device *device,
+ struct anv_embedded_sampler *sampler,
+ struct anv_pipeline_embedded_sampler_binding *binding);
diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c
index 97062a067cc..7d1c57b806d 100644
--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -30,7 +30,8 @@
#include "drm-uapi/drm_fourcc.h"
#include "anv_private.h"
-#include "util/debug.h"
+#include "common/intel_aux_map.h"
+#include "util/u_debug.h"
#include "vk_util.h"
#include "util/u_math.h"
@@ -53,27 +54,35 @@ memory_range_end(struct anv_image_memory_range memory_range)
}
/**
- * Get binding for VkImagePlaneMemoryRequirementsInfo and
- * VkBindImagePlaneMemoryInfo.
+ * Get binding for VkImagePlaneMemoryRequirementsInfo,
+ * VkBindImagePlaneMemoryInfo and VkDeviceImageMemoryRequirements.
*/
-static struct anv_image_binding *
-image_aspect_to_binding(struct anv_image *image, VkImageAspectFlags aspect)
+struct anv_image_binding *
+anv_image_aspect_to_binding(struct anv_image *image,
+ VkImageAspectFlags aspect)
{
- uint32_t plane;
+ uint32_t plane = 0;
assert(image->disjoint);
if (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
/* Spec requires special aspects for modifier images. */
- assert(aspect >= VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT &&
- aspect <= VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT);
+ assert(aspect == VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT ||
+ aspect == VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT ||
+ aspect == VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT ||
+ aspect == VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT);
/* We don't advertise DISJOINT for modifiers with aux, and therefore we
* don't handle queries of the modifier's "aux plane" here.
*/
assert(!isl_drm_modifier_has_aux(image->vk.drm_format_mod));
- plane = aspect - VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT;
+ switch(aspect) {
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT: plane = 0; break;
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_1_BIT_EXT: plane = 1; break;
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_2_BIT_EXT: plane = 2; break;
+ case VK_IMAGE_ASPECT_MEMORY_PLANE_3_BIT_EXT: plane = 3; break;
+ }
} else {
plane = anv_image_aspect_to_plane(image, aspect);
}
@@ -126,51 +135,44 @@ image_binding_grow(const struct anv_device *device,
&image->bindings[binding].memory_range;
if (has_implicit_offset) {
- offset = align_u64(container->offset + container->size, alignment);
+ offset = align64(container->offset + container->size, alignment);
} else {
/* Offset must be validated because it comes from
* VkImageDrmFormatModifierExplicitCreateInfoEXT.
*/
if (unlikely(!anv_is_aligned(offset, alignment))) {
- return vk_errorf(device, &device->vk.base,
+ return vk_errorf(device,
VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
"VkImageDrmFormatModifierExplicitCreateInfoEXT::"
"pPlaneLayouts[]::offset is misaligned");
}
-
- /* We require that surfaces be added in memory-order. This simplifies the
- * layout validation required by
- * VkImageDrmFormatModifierExplicitCreateInfoEXT,
- */
- if (unlikely(offset < container->size)) {
- return vk_errorf(device, &device->vk.base,
- VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
- "VkImageDrmFormatModifierExplicitCreateInfoEXT::"
- "pPlaneLayouts[]::offset is too small");
- }
}
- if (__builtin_add_overflow(offset, size, &container->size)) {
+   /* Surfaces can be added out of memory order. Track the end of each memory
+ * plane to update the binding size properly.
+ */
+ uint64_t memory_range_end;
+ if (__builtin_add_overflow(offset, size, &memory_range_end)) {
if (has_implicit_offset) {
assert(!"overflow");
- return vk_errorf(device, &device->vk.base,
- VK_ERROR_UNKNOWN,
+ return vk_errorf(device, VK_ERROR_UNKNOWN,
"internal error: overflow in %s", __func__);
} else {
- return vk_errorf(device, &device->vk.base,
+ return vk_errorf(device,
VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
"VkImageDrmFormatModifierExplicitCreateInfoEXT::"
"pPlaneLayouts[]::offset is too large");
}
}
+ container->size = MAX2(container->size, memory_range_end);
container->alignment = MAX2(container->alignment, alignment);
*out_range = (struct anv_image_memory_range) {
.binding = binding,
- .offset = offset,
- .size = size,
.alignment = alignment,
+ .size = size,
+ .offset = offset,
};
return VK_SUCCESS;
@@ -200,26 +202,55 @@ memory_range_merge(struct anv_image_memory_range *a,
a->size = MAX2(a->size, b.offset + b.size);
}
-static isl_surf_usage_flags_t
-choose_isl_surf_usage(VkImageCreateFlags vk_create_flags,
- VkImageUsageFlags vk_usage,
- isl_surf_usage_flags_t isl_extra_usage,
- VkImageAspectFlagBits aspect)
+isl_surf_usage_flags_t
+anv_image_choose_isl_surf_usage(struct anv_physical_device *device,
+ VkImageCreateFlags vk_create_flags,
+ VkImageUsageFlags vk_usage,
+ isl_surf_usage_flags_t isl_extra_usage,
+ VkImageAspectFlagBits aspect,
+ VkImageCompressionFlagsEXT comp_flags)
{
isl_surf_usage_flags_t isl_usage = isl_extra_usage;
+   /* On platforms like MTL, we choose to allocate additional CCS memory at
+    * the back of the VkDeviceMemory objects, since different images can share
+    * an AUX-TT PTE (the HW doesn't care about the image format in the PTE).
+    * That means we can always ignore the AUX-TT alignment requirement from an
+    * ISL point of view.
+ */
+ if (device->alloc_aux_tt_mem)
+ isl_usage |= ISL_SURF_USAGE_NO_AUX_TT_ALIGNMENT_BIT;
+
if (vk_usage & VK_IMAGE_USAGE_SAMPLED_BIT)
isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT;
if (vk_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
isl_usage |= ISL_SURF_USAGE_TEXTURE_BIT;
+ if (vk_usage & VK_IMAGE_USAGE_STORAGE_BIT)
+ isl_usage |= ISL_SURF_USAGE_STORAGE_BIT;
+
if (vk_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
+ if (vk_usage & VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR)
+ isl_usage |= ISL_SURF_USAGE_CPB_BIT;
+
+ if (vk_create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT)
+ isl_usage |= ISL_SURF_USAGE_SPARSE_BIT |
+ ISL_SURF_USAGE_DISABLE_AUX_BIT;
+
+ if (vk_usage & VK_IMAGE_USAGE_VIDEO_DECODE_DST_BIT_KHR ||
+ vk_usage & VK_IMAGE_USAGE_VIDEO_DECODE_DPB_BIT_KHR)
+ isl_usage |= ISL_SURF_USAGE_VIDEO_DECODE_BIT;
+
if (vk_create_flags & VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT)
isl_usage |= ISL_SURF_USAGE_CUBE_BIT;
+ if (vk_create_flags & (VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT |
+ VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT))
+ isl_usage |= ISL_SURF_USAGE_2D_3D_COMPATIBLE_BIT;
+
/* Even if we're only using it for transfer operations, clears to depth and
* stencil images happen as depth and stencil so they need the right ISL
* usage bits or else things will fall apart.
@@ -253,6 +284,9 @@ choose_isl_surf_usage(VkImageCreateFlags vk_create_flags,
isl_usage |= ISL_SURF_USAGE_RENDER_TARGET_BIT;
}
+ if (comp_flags & VK_IMAGE_COMPRESSION_DISABLED_EXT)
+ isl_usage |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
+
return isl_usage;
}
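
As a concrete reading of the mapping above (illustrative only, assuming no extra ISL usage flags and a device without alloc_aux_tt_mem): a cube-compatible image created with SAMPLED and COLOR_ATTACHMENT usage resolves to

   isl_surf_usage_flags_t expected = ISL_SURF_USAGE_TEXTURE_BIT |
                                     ISL_SURF_USAGE_RENDER_TARGET_BIT |
                                     ISL_SURF_USAGE_CUBE_BIT;

and adding VK_IMAGE_CREATE_SPARSE_BINDING_BIT would further OR in ISL_SURF_USAGE_SPARSE_BIT and ISL_SURF_USAGE_DISABLE_AUX_BIT.
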
@@ -319,67 +353,138 @@ add_surface(struct anv_device *device,
&surf->memory_range);
}
+static bool
+can_fast_clear_with_non_zero_color(const struct intel_device_info *devinfo,
+ const struct anv_image *image,
+ uint32_t plane,
+ const VkImageFormatListCreateInfo *fmt_list)
+{
+ /* If we don't have an AUX surface where fast clears apply, we can return
+ * early.
+ */
+ if (!isl_aux_usage_has_fast_clears(image->planes[plane].aux_usage))
+ return false;
+
+ /* On TGL (< C0), if a block of fragment shader outputs matches the surface's
+ * clear color, the HW may convert them to fast-clears (see HSD 1607794140).
+ * This can lead to rendering corruption if not handled properly. We
+ * restrict the clear color to zero to avoid issues that can occur with:
+ * - Texture view rendering (including blorp_copy calls)
+ * - Images with multiple levels or array layers
+ */
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E)
+ return false;
+
+ /* Turning on non-zero fast clears for CCS_E causes a performance
+ * regression in games such as F1 22 and RDR2 by introducing additional
+ * partial resolves. Keep non-zero fast clears off until we can fix
+ * performance.
+ */
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E &&
+ devinfo->ver >= 12)
+ return false;
+
+ /* Non-mutable image: we can fast clear with any color supported by the HW.
+ */
+ if (!(image->vk.create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT))
+ return true;
+
+ /* Mutable image with no format list: we have to assume all formats */
+ if (!fmt_list || fmt_list->viewFormatCount == 0)
+ return false;
+
+ enum isl_format img_format = image->planes[plane].primary_surface.isl.format;
+
+ /* Check bit compatibility for clear color components */
+ for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
+ if (fmt_list->pViewFormats[i] == VK_FORMAT_UNDEFINED)
+ continue;
+
+ struct anv_format_plane view_format_plane =
+ anv_get_format_plane(devinfo, fmt_list->pViewFormats[i],
+ plane, image->vk.tiling);
+
+ enum isl_format view_format = view_format_plane.isl_format;
+
+ if (!isl_formats_have_same_bits_per_channel(img_format, view_format))
+ return false;
+ }
+
+ return true;
+}
+
/**
- * Do hardware limitations require the image plane to use a shadow surface?
+ * Return true if the storage image could be used with atomics.
*
- * If hardware limitations force us to use a shadow surface, then the same
- * limitations may also constrain the tiling of the primary surface; therefore
- * paramater @a inout_primary_tiling_flags.
- *
- * If the image plane is a separate stencil plane and if the user provided
- * VkImageStencilUsageCreateInfoEXT, then @a usage must be stencilUsage.
- *
- * @see anv_image::planes[]::shadow_surface
+ * If the image was created with an explicit format, we check it for typed
+ * atomic support. If MUTABLE_FORMAT_BIT is set, then we check the optional
+ * format list, seeing if /any/ of the formats support typed atomics. If no
+ * list is supplied, we fall back to using the bpb, as the application could
+ * make an image view with a format that does use atomics.
*/
static bool
-anv_image_plane_needs_shadow_surface(const struct intel_device_info *devinfo,
- struct anv_format_plane plane_format,
+storage_image_format_supports_atomic(const struct intel_device_info *devinfo,
+ VkImageCreateFlags create_flags,
+ enum isl_format format,
VkImageTiling vk_tiling,
- VkImageUsageFlags vk_plane_usage,
- VkImageCreateFlags vk_create_flags,
- isl_tiling_flags_t *inout_primary_tiling_flags)
-{
- if (devinfo->ver <= 8 &&
- (vk_create_flags & VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT) &&
- vk_tiling == VK_IMAGE_TILING_OPTIMAL) {
- /* We must fallback to a linear surface because we may not be able to
- * correctly handle the offsets if tiled. (On gfx9,
- * RENDER_SURFACE_STATE::X/Y Offset are sufficient). To prevent garbage
- * performance while texturing, we maintain a tiled shadow surface.
- */
- assert(isl_format_is_compressed(plane_format.isl_format));
+ const VkImageFormatListCreateInfo *fmt_list)
+{
+ if (isl_format_supports_typed_atomics(devinfo, format))
+ return true;
- if (inout_primary_tiling_flags) {
- *inout_primary_tiling_flags = ISL_TILING_LINEAR_BIT;
- }
+ if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT))
+ return false;
- return true;
- }
+ if (fmt_list) {
+ for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
+ if (fmt_list->pViewFormats[i] == VK_FORMAT_UNDEFINED)
+ continue;
- if (devinfo->ver <= 7 &&
- plane_format.aspect == VK_IMAGE_ASPECT_STENCIL_BIT &&
- (vk_plane_usage & VK_IMAGE_USAGE_SAMPLED_BIT)) {
- /* gfx7 can't sample from W-tiled surfaces. */
- return true;
+ enum isl_format view_format =
+ anv_get_isl_format(devinfo, fmt_list->pViewFormats[i],
+ VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling);
+
+ if (isl_format_supports_typed_atomics(devinfo, view_format))
+ return true;
+ }
+
+ return false;
}
- return false;
+ /* No explicit format list. Any 16/32/64bpp format could be used with atomics. */
+ unsigned bpb = isl_format_get_layout(format)->bpb;
+ return bpb == 16 || bpb == 32 || bpb == 64;
}
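
A short illustration of that fallback, reading the checks above (not driver code; the format names are just examples):

   /* VK_FORMAT_R32_SFLOAT, MUTABLE, no format list:
    *    bpb == 32, so a later R32_UINT view could be used for image
    *    atomics -> conservatively report "supports atomics".
    *
    * VK_FORMAT_R32G32B32A32_SFLOAT, MUTABLE, no format list:
    *    bpb == 128, which is not 16/32/64 -> no atomic-capable alias
    *    exists, so the fallback returns false. */
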
-bool
-anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
- VkImageCreateFlags create_flags,
- VkFormat vk_format,
- VkImageTiling vk_tiling,
- const VkImageFormatListCreateInfoKHR *fmt_list)
+static enum isl_format
+anv_get_isl_format_with_usage(const struct intel_device_info *devinfo,
+ VkFormat vk_format,
+ VkImageAspectFlagBits vk_aspect,
+ VkImageUsageFlags vk_usage,
+ VkImageTiling vk_tiling)
{
- enum isl_format format =
- anv_get_isl_format(devinfo, vk_format,
- VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling);
+ assert(util_bitcount(vk_usage) == 1);
+ struct anv_format_plane format =
+ anv_get_format_aspect(devinfo, vk_format, vk_aspect,
+ vk_tiling);
+
+ return format.isl_format;
+}
- if (!isl_format_supports_ccs_e(devinfo, format))
+static bool
+formats_ccs_e_compatible(const struct intel_device_info *devinfo,
+ VkImageCreateFlags create_flags,
+ enum isl_format format, VkImageTiling vk_tiling,
+ VkImageUsageFlags vk_usage,
+ const VkImageFormatListCreateInfo *fmt_list)
+{
+ if (!anv_format_supports_ccs_e(devinfo, format))
return false;
+ /* For images created without MUTABLE_FORMAT_BIT set, we know that they will
+ * always be used with the original format. In particular, they will always
+ * be used with a format that supports color compression.
+ */
if (!(create_flags & VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT))
return true;
@@ -387,9 +492,13 @@ anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
return false;
for (uint32_t i = 0; i < fmt_list->viewFormatCount; i++) {
+ if (fmt_list->pViewFormats[i] == VK_FORMAT_UNDEFINED)
+ continue;
+
enum isl_format view_format =
- anv_get_isl_format(devinfo, fmt_list->pViewFormats[i],
- VK_IMAGE_ASPECT_COLOR_BIT, vk_tiling);
+ anv_get_isl_format_with_usage(devinfo, fmt_list->pViewFormats[i],
+ VK_IMAGE_ASPECT_COLOR_BIT, vk_usage,
+ vk_tiling);
if (!isl_formats_are_ccs_e_compatible(devinfo, format, view_format))
return false;
@@ -398,6 +507,62 @@ anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
return true;
}
+bool
+anv_format_supports_ccs_e(const struct intel_device_info *devinfo,
+ const enum isl_format format)
+{
+ /* CCS_E for YCRCB_NORMAL and YCRCB_SWAP_UV is not currently supported by
+ * ANV so leave it disabled for now.
+ */
+ if (isl_format_is_yuv(format))
+ return false;
+
+ return isl_format_supports_ccs_e(devinfo, format);
+}
+
+bool
+anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
+ VkImageCreateFlags create_flags,
+ VkFormat vk_format, VkImageTiling vk_tiling,
+ VkImageUsageFlags vk_usage,
+ const VkImageFormatListCreateInfo *fmt_list)
+{
+ enum isl_format format =
+ anv_get_isl_format_with_usage(devinfo, vk_format,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ VK_IMAGE_USAGE_SAMPLED_BIT, vk_tiling);
+
+ if (!formats_ccs_e_compatible(devinfo, create_flags, format, vk_tiling,
+ VK_IMAGE_USAGE_SAMPLED_BIT, fmt_list))
+ return false;
+
+ if (vk_usage & VK_IMAGE_USAGE_STORAGE_BIT) {
+ if (devinfo->verx10 < 125)
+ return false;
+
+ enum isl_format lower_format =
+ anv_get_isl_format_with_usage(devinfo, vk_format,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ VK_IMAGE_USAGE_STORAGE_BIT, vk_tiling);
+
+ if (!isl_formats_are_ccs_e_compatible(devinfo, format, lower_format))
+ return false;
+
+ if (!formats_ccs_e_compatible(devinfo, create_flags, format, vk_tiling,
+ VK_IMAGE_USAGE_STORAGE_BIT, fmt_list))
+ return false;
+
+ /* Disable compression when the surface can potentially be used for atomic
+ * operations.
+ */
+ if (storage_image_format_supports_atomic(devinfo, create_flags, format,
+ vk_tiling, fmt_list))
+ return false;
+ }
+
+ return true;
+}
+
/**
* For color images that have an auxiliary surface, request allocation for an
* additional buffer that mainly stores fast-clear values. Use of this buffer
@@ -455,6 +620,7 @@ anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
static VkResult MUST_CHECK
add_aux_state_tracking_buffer(struct anv_device *device,
struct anv_image *image,
+ uint64_t state_offset,
uint32_t plane)
{
assert(image && device);
@@ -462,18 +628,43 @@ add_aux_state_tracking_buffer(struct anv_device *device,
image->vk.aspects & (VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV |
VK_IMAGE_ASPECT_DEPTH_BIT));
- const unsigned clear_color_state_size = device->info.ver >= 10 ?
- device->isl_dev.ss.clear_color_state_size :
- device->isl_dev.ss.clear_value_size;
+ unsigned clear_color_state_size;
+ if (device->info->ver >= 11) {
+ /* When importing an image from another source with a drm modifier that
+ * supports clear color, the clear color values are in a 32-byte struct
+ * defined in drm_fourcc.h. The fast clear type and compression state
+ * are not defined in these drm_fourcc.h, so there won't be memory
+ * allocated for these extra meta data by the source.
+ *
+ * We use the last 2 dwords of the clear color struct's memory to store
+ * the fast clear type and the first compression state, so the driver
+ * doesn't assume the extra size or need another allocation later.
+ *
+ * So far, the 2 stolen dwords are either unused in the clear color struct
+ * or used only for features that aren't enabled. There should be no side
+ * effects on the hardware or on the destinations of images exported by this driver.
+ *
+ * Images with multiple levels or layers are not supported by drm
+ * modifiers, so we don't strictly need this trick there, nor do we run
+ * short of space for multiple compression states. We just apply the
+ * approach to all cases to keep the design unified.
+ *
+ * As a result, the state starts 8 bytes lower than where it should be.
+ */
+ assert(device->isl_dev.ss.clear_color_state_size >= 32);
+ clear_color_state_size = device->isl_dev.ss.clear_color_state_size - 8;
+ } else {
+ clear_color_state_size = device->isl_dev.ss.clear_value_size;
+ }
/* Clear color and fast clear type */
unsigned state_size = clear_color_state_size + 4;
/* We only need to track compression on CCS_E surfaces. */
- if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
+ if (isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage)) {
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
for (uint32_t l = 0; l < image->vk.mip_levels; l++)
- state_size += anv_minify(image->vk.extent.depth, l) * 4;
+ state_size += u_minify(image->vk.extent.depth, l) * 4;
} else {
state_size += image->vk.mip_levels * image->vk.array_layers * 4;
}
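
A worked sizing example for the block above (assuming the minimum 32-byte clear color state that the assert allows; the numbers are only illustrative):

   /* Hypothetical Gfx11+ 2D CCS_E image, 10 mip levels, 1 array layer:
    *
    *   clear_color_state_size = 32 - 8          = 24 B
    *   fast clear type dword                    =  4 B
    *   compression state: 10 levels * 1 * 4 B   = 40 B
    *   ----------------------------------------------
    *   state_size                               = 68 B
    *
    * image_binding_grow() then places this range with 4 KiB alignment. */
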
@@ -482,17 +673,53 @@ add_aux_state_tracking_buffer(struct anv_device *device,
enum anv_image_memory_binding binding =
ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane;
- if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID)
- binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ /* If an auxiliary surface is used for an externally-shareable image,
+ * we have to hide this from the memory of the image since other
+ * processes with access to the memory may not be aware of it or of
+ * its current state. So put that auxiliary data into a separate
+ * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE).
+ *
+ * But when the image is created with a drm modifier that supports
+ * clear color, it will be exported along with main surface.
+ */
+ if (anv_image_is_externally_shared(image)
+ && !isl_drm_modifier_get_info(image->vk.drm_format_mod)->supports_clear_color) {
+ binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ }
/* We believe that 256B alignment may be sufficient, but we choose 4K due to
* lack of testing. And MI_LOAD/STORE operations require dword-alignment.
*/
return image_binding_grow(device, image, binding,
- ANV_OFFSET_IMPLICIT, state_size, 4096,
+ state_offset, state_size, 4096,
&image->planes[plane].fast_clear_memory_range);
}
+static VkResult MUST_CHECK
+add_compression_control_buffer(struct anv_device *device,
+ struct anv_image *image,
+ uint32_t plane,
+ uint32_t binding,
+ uint64_t offset)
+{
+ assert(device->info->has_aux_map);
+
+ uint64_t ratio = intel_aux_get_main_to_aux_ratio(device->aux_map_ctx);
+ assert(image->planes[plane].primary_surface.isl.size_B % ratio == 0);
+ uint64_t size = image->planes[plane].primary_surface.isl.size_B / ratio;
+
+ /* The diagram in the Bspec section, Memory Compression - Gfx12 (44930),
+ * shows that the CCS is indexed in 256B chunks for TGL, 4K chunks for MTL.
+ * When modifiers are in use, the 4K alignment requirement of the
+ * PLANE_AUX_DIST::Auxiliary Surface Distance field must be considered
+ * (Bspec 50379). Keep things simple and just use 4K.
+ */
+ uint32_t alignment = 4096;
+
+ return image_binding_grow(device, image, binding, offset, size, alignment,
+ &image->planes[plane].compr_ctrl_memory_range);
+}
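
A quick sizing sketch for the helper above; the 256:1 ratio is only an assumption for the example, the real value comes from intel_aux_get_main_to_aux_ratio():

   /*   primary surface size : 64 MiB
    *   main-to-aux ratio     : 256 (assumed for the example)
    *   CCS binding size      : 64 MiB / 256 = 256 KiB
    *   placed with 4 KiB alignment, per the Bspec note above. */
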
+
/**
* The return code indicates whether creation of the VkImage should continue
* or fail, not whether the creation of the aux surface succeeded. If the aux
@@ -507,9 +734,10 @@ add_aux_surface_if_supported(struct anv_device *device,
struct anv_image *image,
uint32_t plane,
struct anv_format_plane plane_format,
- const VkImageFormatListCreateInfoKHR *fmt_list,
+ const VkImageFormatListCreateInfo *fmt_list,
uint64_t offset,
uint32_t stride,
+ uint64_t aux_state_offset,
isl_surf_usage_flags_t isl_extra_usage_flags)
{
VkImageAspectFlags aspect = plane_format.aspect;
@@ -522,6 +750,27 @@ add_aux_surface_if_supported(struct anv_device *device,
if ((isl_extra_usage_flags & ISL_SURF_USAGE_DISABLE_AUX_BIT))
return VK_SUCCESS;
+ /* TODO: consider whether compression with sparse is workable. */
+ if (anv_image_is_sparse(image))
+ return VK_SUCCESS;
+
+ /* If the resource is created with sharing mode CONCURRENT when multiple
+ * queues are supported, we can't support compression since we can't do a
+ * FULL_RESOLVE/PARTIAL_RESOLVE to construct the main surface data without
+ * a barrier.
+ */
+ if (image->vk.sharing_mode == VK_SHARING_MODE_CONCURRENT &&
+ device->queue_count > 1)
+ return VK_SUCCESS;
+
+ uint32_t binding;
+ if (image->vk.drm_format_mod == DRM_FORMAT_MOD_INVALID ||
+ isl_drm_modifier_has_aux(image->vk.drm_format_mod)) {
+ binding = ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane;
+ } else {
+ binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ }
+
if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) {
/* We don't advertise that depth buffers could be used as storage
* images.
@@ -536,24 +785,12 @@ add_aux_surface_if_supported(struct anv_device *device,
return VK_SUCCESS;
}
- if (device->info.ver == 7) {
- anv_perf_warn(device, &image->vk.base, "Implement gfx7 HiZ");
+ /* TODO: Adjust blorp for multi-LOD HiZ surface on Gen8 - Gen9 */
+ if (image->vk.mip_levels > 1 && device->info->ver <= 9) {
+ anv_perf_warn(VK_LOG_OBJS(&image->vk.base), "Enable multi-LOD HiZ");
return VK_SUCCESS;
}
- if (image->vk.mip_levels > 1) {
- anv_perf_warn(device, &image->vk.base, "Enable multi-LOD HiZ");
- return VK_SUCCESS;
- }
-
- if (device->info.ver == 8 && image->vk.samples > 1) {
- anv_perf_warn(device, &image->vk.base, "Enable gfx8 multisampled HiZ");
- return VK_SUCCESS;
- }
-
- if (INTEL_DEBUG & DEBUG_NO_HIZ)
- return VK_SUCCESS;
-
ok = isl_surf_get_hiz_surf(&device->isl_dev,
&image->planes[plane].primary_surface.isl,
&image->planes[plane].aux_surface.isl);
@@ -575,32 +812,45 @@ add_aux_surface_if_supported(struct anv_device *device,
*
* TODO: This is a heuristic trade-off; we haven't tuned it at all.
*/
- assert(device->info.ver >= 12);
+ assert(device->info->ver >= 12);
image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ_CCS_WT;
} else {
- assert(device->info.ver >= 12);
+ assert(device->info->ver >= 12);
image->planes[plane].aux_usage = ISL_AUX_USAGE_HIZ_CCS;
}
result = add_surface(device, image, &image->planes[plane].aux_surface,
- ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane,
- ANV_OFFSET_IMPLICIT);
+ binding, ANV_OFFSET_IMPLICIT);
if (result != VK_SUCCESS)
return result;
+ if (anv_image_plane_uses_aux_map(device, image, plane)) {
+ result = add_compression_control_buffer(device, image, plane,
+ binding,
+ ANV_OFFSET_IMPLICIT);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
if (image->planes[plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT)
- return add_aux_state_tracking_buffer(device, image, plane);
+ return add_aux_state_tracking_buffer(device, image,
+ aux_state_offset,
+ plane);
} else if (aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
-
- if (INTEL_DEBUG & DEBUG_NO_RBC)
- return VK_SUCCESS;
-
if (!isl_surf_supports_ccs(&device->isl_dev,
&image->planes[plane].primary_surface.isl,
NULL))
return VK_SUCCESS;
image->planes[plane].aux_usage = ISL_AUX_USAGE_STC_CCS;
+
+ if (device->info->has_aux_map) {
+ result = add_compression_control_buffer(device, image, plane,
+ binding,
+ ANV_OFFSET_IMPLICIT);
+ if (result != VK_SUCCESS)
+ return result;
+ }
} else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->vk.samples == 1) {
if (image->n_planes != 1) {
/* Multiplanar images seem to hit a sampler bug with CCS and R16G16
@@ -610,7 +860,7 @@ add_aux_surface_if_supported(struct anv_device *device,
return VK_SUCCESS;
}
- if ((image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT)) {
+ if ((image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT) && !image->from_wsi) {
/* The image may alias a plane of a multiplanar image. Above we ban
* CCS on multiplanar images.
*
@@ -622,35 +872,6 @@ add_aux_surface_if_supported(struct anv_device *device,
return VK_SUCCESS;
}
- if (!isl_format_supports_rendering(&device->info,
- plane_format.isl_format)) {
- /* Disable CCS because it is not useful (we can't render to the image
- * with CCS enabled). While it may be technically possible to enable
- * CCS for this case, we currently don't have things hooked up to get
- * it working.
- */
- anv_perf_warn(device, &image->vk.base,
- "This image format doesn't support rendering. "
- "Not allocating an CCS buffer.");
- return VK_SUCCESS;
- }
-
- if (device->info.ver >= 12 && image->vk.array_layers > 1) {
- /* HSD 14010672564: On TGL, if a block of fragment shader outputs
- * match the surface's clear color, the HW may convert them to
- * fast-clears. Anv only does clear color tracking for the first
- * slice unfortunately. Disable CCS until anv gains more clear color
- * tracking abilities.
- */
- anv_perf_warn(device, &image->vk.base,
- "HW may put fast-clear blocks on more slices than SW "
- "currently tracks. Not allocating a CCS buffer.");
- return VK_SUCCESS;
- }
-
- if (INTEL_DEBUG & DEBUG_NO_RBC)
- return VK_SUCCESS;
-
ok = isl_surf_get_ccs_surf(&device->isl_dev,
&image->planes[plane].primary_surface.isl,
NULL,
@@ -660,23 +881,30 @@ add_aux_surface_if_supported(struct anv_device *device,
return VK_SUCCESS;
/* Choose aux usage */
- if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) &&
- anv_formats_ccs_e_compatible(&device->info,
- image->vk.create_flags,
- image->vk.format,
- image->vk.tiling,
- fmt_list)) {
- /* For images created without MUTABLE_FORMAT_BIT set, we know that
- * they will always be used with the original format. In particular,
- * they will always be used with a format that supports color
- * compression. If it's never used as a storage image, then it will
- * only be used through the sampler or the as a render target. This
- * means that it's safe to just leave compression on at all times for
- * these formats.
- */
- image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_E;
- } else if (device->info.ver >= 12) {
- anv_perf_warn(device, &image->vk.base,
+ if (anv_formats_ccs_e_compatible(device->info, image->vk.create_flags,
+ image->vk.format, image->vk.tiling,
+ image->vk.usage, fmt_list)) {
+ if (intel_needs_workaround(device->info, 1607794140)) {
+ /* FCV is permanently enabled on this HW. */
+ image->planes[plane].aux_usage = ISL_AUX_USAGE_FCV_CCS_E;
+ } else if (device->info->verx10 >= 125 &&
+ !device->physical->disable_fcv) {
+ /* FCV is enabled via 3DSTATE_3D_MODE. We'd expect plain CCS_E to
+ * perform better because it allows for non-zero fast clear colors,
+ * but we've run into regressions in several benchmarks (F1 22 and
+ * RDR2) when trying to enable it. When non-zero clear colors are
+ * enabled, we've observed many partial resolves. We haven't yet
+ * root-caused what layout transitions are causing these resolves,
+ * so in the meantime, we choose to reduce our clear color support.
+ * With only zero clear colors being supported, we might as well
+ * turn on FCV.
+ */
+ image->planes[plane].aux_usage = ISL_AUX_USAGE_FCV_CCS_E;
+ } else {
+ image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_E;
+ }
+ } else if (device->info->ver >= 12) {
+ anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
"The CCS_D aux mode is not yet handled on "
"Gfx12+. Not allocating a CCS buffer.");
image->planes[plane].aux_surface.isl.size_B = 0;
@@ -685,21 +913,22 @@ add_aux_surface_if_supported(struct anv_device *device,
image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_D;
}
- if (!device->physical->has_implicit_ccs) {
- enum anv_image_memory_binding binding =
- ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane;
-
- if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID &&
- !isl_drm_modifier_has_aux(image->vk.drm_format_mod))
- binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
-
- result = add_surface(device, image, &image->planes[plane].aux_surface,
- binding, offset);
- if (result != VK_SUCCESS)
- return result;
+ if (device->info->has_flat_ccs) {
+ result = VK_SUCCESS;
+ } else if (device->info->has_aux_map) {
+ result = add_compression_control_buffer(device, image, plane,
+ binding, offset);
+ } else {
+ result = add_surface(device, image,
+ &image->planes[plane].aux_surface, binding,
+ offset);
}
+ if (result != VK_SUCCESS)
+ return result;
- return add_aux_state_tracking_buffer(device, image, plane);
+ return add_aux_state_tracking_buffer(device, image,
+ aux_state_offset,
+ plane);
} else if ((aspect & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) && image->vk.samples > 1) {
assert(!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT));
ok = isl_surf_get_mcs_surf(&device->isl_dev,
@@ -711,51 +940,45 @@ add_aux_surface_if_supported(struct anv_device *device,
image->planes[plane].aux_usage = ISL_AUX_USAGE_MCS;
result = add_surface(device, image, &image->planes[plane].aux_surface,
- ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane,
- ANV_OFFSET_IMPLICIT);
+ binding, ANV_OFFSET_IMPLICIT);
if (result != VK_SUCCESS)
return result;
- return add_aux_state_tracking_buffer(device, image, plane);
+ return add_aux_state_tracking_buffer(device, image,
+ aux_state_offset,
+ plane);
}
return VK_SUCCESS;
}
static VkResult
-add_shadow_surface(struct anv_device *device,
- struct anv_image *image,
- uint32_t plane,
- struct anv_format_plane plane_format,
- uint32_t stride,
- VkImageUsageFlags vk_plane_usage)
+add_video_buffers(struct anv_device *device,
+ struct anv_image *image,
+ const struct VkVideoProfileListInfoKHR *profile_list)
{
ASSERTED bool ok;
+ unsigned size = 0;
- ok = isl_surf_init(&device->isl_dev,
- &image->planes[plane].shadow_surface.isl,
- .dim = vk_to_isl_surf_dim[image->vk.image_type],
- .format = plane_format.isl_format,
- .width = image->vk.extent.width,
- .height = image->vk.extent.height,
- .depth = image->vk.extent.depth,
- .levels = image->vk.mip_levels,
- .array_len = image->vk.array_layers,
- .samples = image->vk.samples,
- .min_alignment_B = 0,
- .row_pitch_B = stride,
- .usage = ISL_SURF_USAGE_TEXTURE_BIT |
- (vk_plane_usage & ISL_SURF_USAGE_CUBE_BIT),
- .tiling_flags = ISL_TILING_ANY_MASK);
-
- /* isl_surf_init() will fail only if provided invalid input. Invalid input
- * here is illegal in Vulkan.
- */
- assert(ok);
+ for (unsigned i = 0; i < profile_list->profileCount; i++) {
+ if (profile_list->pProfiles[i].videoCodecOperation == VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR) {
+ unsigned w_mb = DIV_ROUND_UP(image->vk.extent.width, ANV_MB_WIDTH);
+ unsigned h_mb = DIV_ROUND_UP(image->vk.extent.height, ANV_MB_HEIGHT);
+ size = w_mb * h_mb * 128;
+ }
+ else if (profile_list->pProfiles[i].videoCodecOperation == VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR) {
+ unsigned w_mb = DIV_ROUND_UP(image->vk.extent.width, 32);
+ unsigned h_mb = DIV_ROUND_UP(image->vk.extent.height, 32);
+ size = ALIGN(w_mb * h_mb, 2) << 6;
+ }
+ }
- return add_surface(device, image, &image->planes[plane].shadow_surface,
- ANV_IMAGE_MEMORY_BINDING_PLANE_0 + plane,
- ANV_OFFSET_IMPLICIT);
+ if (size == 0)
+ return VK_SUCCESS;
+
+ ok = image_binding_grow(device, image, ANV_IMAGE_MEMORY_BINDING_PRIVATE,
+ ANV_OFFSET_IMPLICIT, size, 65536, &image->vid_dmv_top_surface);
+ return ok;
}
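
A worked example of the sizing above for a 1920x1080 decode target, assuming ANV_MB_WIDTH and ANV_MB_HEIGHT are the usual 16x16 macroblock dimensions:

   /* H.264: w_mb = DIV_ROUND_UP(1920, 16) = 120
    *        h_mb = DIV_ROUND_UP(1080, 16) =  68
    *        size = 120 * 68 * 128         = 1,044,480 bytes
    *
    * H.265: w_mb = DIV_ROUND_UP(1920, 32) =  60
    *        h_mb = DIV_ROUND_UP(1080, 32) =  34
    *        size = ALIGN(60 * 34, 2) << 6 = 2040 * 64 = 130,560 bytes
    *
    * Either way the buffer lands in the private binding with 64 KiB
    * alignment. */
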
/**
@@ -777,11 +1000,21 @@ add_primary_surface(struct anv_device *device,
struct anv_surface *anv_surf = &image->planes[plane].primary_surface;
bool ok;
+ uint32_t width = image->vk.extent.width;
+ uint32_t height = image->vk.extent.height;
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ vk_format_get_ycbcr_info(image->vk.format);
+ if (ycbcr_info) {
+ assert(plane < ycbcr_info->n_planes);
+ width /= ycbcr_info->planes[plane].denominator_scales[0];
+ height /= ycbcr_info->planes[plane].denominator_scales[1];
+ }
+
ok = isl_surf_init(&device->isl_dev, &anv_surf->isl,
.dim = vk_to_isl_surf_dim[image->vk.image_type],
.format = plane_format.isl_format,
- .width = image->vk.extent.width / plane_format.denominator_scales[0],
- .height = image->vk.extent.height / plane_format.denominator_scales[1],
+ .width = width,
+ .height = height,
.depth = image->vk.extent.depth,
.levels = image->vk.mip_levels,
.array_len = image->vk.array_layers,
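
For orientation, the width/height division above with a standard 4:2:0 two-plane format (e.g. VK_FORMAT_G8_B8R8_2PLANE_420_UNORM) works out as follows:

   /* 1920x1080 image:
    *   plane 0 (luma)   : scales {1, 1} -> 1920 x 1080
    *   plane 1 (chroma) : scales {2, 2} ->  960 x  540 */
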
@@ -810,6 +1043,16 @@ memory_range_is_aligned(struct anv_image_memory_range memory_range)
{
return anv_is_aligned(memory_range.offset, memory_range.alignment);
}
+
+static bool MUST_CHECK
+memory_ranges_equal(struct anv_image_memory_range a,
+ struct anv_image_memory_range b)
+{
+ return a.binding == b.binding &&
+ a.alignment == b.alignment &&
+ a.size == b.size &&
+ a.offset == b.offset;
+}
#endif
struct check_memory_range_params {
@@ -858,7 +1101,7 @@ static void
check_memory_bindings(const struct anv_device *device,
const struct anv_image *image)
{
-#ifdef DEBUG
+#if MESA_DEBUG
/* As we inspect each part of the image, we merge the part's memory range
* into these accumulation ranges.
*/
@@ -878,9 +1121,12 @@ check_memory_bindings(const struct anv_device *device,
: ANV_IMAGE_MEMORY_BINDING_MAIN;
/* Aliasing is incompatible with the private binding because it does not
- * live in a VkDeviceMemory.
+ * live in a VkDeviceMemory. The exceptions are swapchain images and
+ * private bindings used for video motion vector buffers.
*/
assert(!(image->vk.create_flags & VK_IMAGE_CREATE_ALIAS_BIT) ||
+ image->from_wsi ||
+ (plane->primary_surface.isl.usage & ISL_SURF_USAGE_VIDEO_DECODE_BIT) ||
image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].memory_range.size == 0);
/* Check primary surface */
@@ -888,20 +1134,22 @@ check_memory_bindings(const struct anv_device *device,
.test_surface = &plane->primary_surface,
.expect_binding = primary_binding);
- /* Check shadow surface */
- if (anv_surface_is_valid(&plane->shadow_surface)) {
- check_memory_range(accum_ranges,
- .test_surface = &plane->shadow_surface,
- .expect_binding = primary_binding);
- }
-
/* Check aux_surface */
- if (anv_surface_is_valid(&plane->aux_surface)) {
+ const struct anv_image_memory_range *aux_mem_range =
+ anv_image_get_aux_memory_range(image, p);
+ if (aux_mem_range->size > 0) {
enum anv_image_memory_binding binding = primary_binding;
- if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID &&
- !isl_drm_modifier_has_aux(image->vk.drm_format_mod))
+ /* If an auxiliary surface is used for an externally-shareable image,
+ * we have to hide this from the memory of the image since other
+ * processes with access to the memory may not be aware of it or of
+ * its current state. So put that auxiliary data into a separate
+ * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE).
+ */
+ if (anv_image_is_externally_shared(image) &&
+ !isl_drm_modifier_has_aux(image->vk.drm_format_mod)) {
binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ }
/* Display hardware requires that the aux surface start at
* a higher address than the primary surface. The 3D hardware
@@ -909,7 +1157,7 @@ check_memory_bindings(const struct anv_device *device,
* the image is sent to display.
*/
check_memory_range(accum_ranges,
- .test_surface = &plane->aux_surface,
+ .test_range = aux_mem_range,
.expect_binding = binding);
}
@@ -917,8 +1165,19 @@ check_memory_bindings(const struct anv_device *device,
if (plane->fast_clear_memory_range.size > 0) {
enum anv_image_memory_binding binding = primary_binding;
- if (image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID)
+ /* If an auxiliary surface is used for an externally-shareable image,
+ * we have to hide this from the memory of the image since other
+ * processes with access to the memory may not be aware of it or of
+ * its current state. So put that auxiliary data into a separate
+ * buffer (ANV_IMAGE_MEMORY_BINDING_PRIVATE).
+ *
+ * But when the image is created with a drm modifier that supports
+ * clear color, it will be exported along with main surface.
+ */
+ if (anv_image_is_externally_shared(image)
+ && !isl_drm_modifier_get_info(image->vk.drm_format_mod)->supports_clear_color) {
binding = ANV_IMAGE_MEMORY_BINDING_PRIVATE;
+ }
/* We believe that 256B alignment may be sufficient, but we choose 4K
* due to lack of testing. And MI_LOAD/STORE operations require
@@ -969,7 +1228,7 @@ check_drm_format_mod(const struct anv_device *device,
isl_drm_modifier_get_info(image->vk.drm_format_mod);
/* Driver must support the modifier. */
- assert(isl_drm_modifier_get_score(&device->info, isl_mod_info->modifier));
+ assert(isl_drm_modifier_get_score(device->info, isl_mod_info->modifier));
/* Enforced by us, not the Vulkan spec. */
assert(image->vk.image_type == VK_IMAGE_TYPE_2D);
@@ -988,9 +1247,8 @@ check_drm_format_mod(const struct anv_device *device,
assert(isl_layout->txc == ISL_TXC_NONE);
assert(isl_layout->colorspace == ISL_COLORSPACE_LINEAR ||
isl_layout->colorspace == ISL_COLORSPACE_SRGB);
- assert(!anv_surface_is_valid(&plane->shadow_surface));
- if (isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+ if (isl_drm_modifier_has_aux(isl_mod_info->modifier)) {
/* Reject DISJOINT for consistency with the GL driver. */
assert(!image->disjoint);
@@ -998,8 +1256,11 @@ check_drm_format_mod(const struct anv_device *device,
* The inverse, however, does not hold; if the modifier has no aux
* usage, then we may enable a private aux surface.
*/
- if (plane->aux_usage != isl_mod_info->aux_usage) {
- return vk_errorf(device, &image->vk.base, VK_ERROR_UNKNOWN,
+ if ((isl_mod_info->supports_media_compression &&
+ plane->aux_usage != ISL_AUX_USAGE_MC) ||
+ (isl_mod_info->supports_render_compression &&
+ !isl_aux_usage_has_ccs_e(plane->aux_usage))) {
+ return vk_errorf(device, VK_ERROR_UNKNOWN,
"image with modifier unexpectedly has wrong aux "
"usage");
}
@@ -1020,55 +1281,67 @@ add_all_surfaces_implicit_layout(
const VkImageFormatListCreateInfo *format_list_info,
uint32_t stride,
isl_tiling_flags_t isl_tiling_flags,
- const struct anv_image_create_info *create_info)
+ isl_surf_usage_flags_t isl_extra_usage_flags)
{
- assert(create_info);
- const struct intel_device_info *devinfo = &device->info;
- isl_surf_usage_flags_t isl_extra_usage_flags =
- create_info->isl_extra_usage_flags;
+ const struct intel_device_info *devinfo = device->info;
VkResult result;
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ vk_format_get_ycbcr_info(image->vk.format);
+ if (ycbcr_info)
+ assert(ycbcr_info->n_planes == image->n_planes);
+
+ unsigned num_aspects = 0;
+ VkImageAspectFlagBits aspects[3];
u_foreach_bit(b, image->vk.aspects) {
- VkImageAspectFlagBits aspect = 1 << b;
+ assert(num_aspects < 3);
+ aspects[num_aspects++] = 1 << b;
+ }
+ assert(num_aspects == image->n_planes);
+
+ /* The Android hardware buffer YV12 format has the planes ordered as Y-Cr-Cb,
+ * while Vulkan expects VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM to be in Y-Cb-Cr.
+ * Adjust the order we add the ISL surfaces accordingly so the implicit
+ * offset gets calculated correctly.
+ */
+ if (image->from_ahb && image->vk.format == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM) {
+ assert(num_aspects == 3);
+ assert(aspects[1] == VK_IMAGE_ASPECT_PLANE_1_BIT);
+ assert(aspects[2] == VK_IMAGE_ASPECT_PLANE_2_BIT);
+ aspects[1] = VK_IMAGE_ASPECT_PLANE_2_BIT;
+ aspects[2] = VK_IMAGE_ASPECT_PLANE_1_BIT;
+ }
+
+ for (unsigned i = 0; i < num_aspects; i++) {
+ VkImageAspectFlagBits aspect = aspects[i];
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
const struct anv_format_plane plane_format =
anv_get_format_plane(devinfo, image->vk.format, plane, image->vk.tiling);
+ enum isl_format isl_fmt = plane_format.isl_format;
+ assert(isl_fmt != ISL_FORMAT_UNSUPPORTED);
+
+ uint32_t plane_stride = stride * isl_format_get_layout(isl_fmt)->bpb / 8;
+ if (ycbcr_info)
+ plane_stride /= ycbcr_info->planes[plane].denominator_scales[0];
+
VkImageUsageFlags vk_usage = vk_image_usage(&image->vk, aspect);
isl_surf_usage_flags_t isl_usage =
- choose_isl_surf_usage(image->vk.create_flags, vk_usage,
- isl_extra_usage_flags, aspect);
-
- /* Must call this before adding any surfaces because it may modify
- * isl_tiling_flags.
- */
- bool needs_shadow =
- anv_image_plane_needs_shadow_surface(devinfo, plane_format,
- image->vk.tiling, vk_usage,
- image->vk.create_flags,
- &isl_tiling_flags);
+ anv_image_choose_isl_surf_usage(device->physical,
+ image->vk.create_flags, vk_usage,
+ isl_extra_usage_flags, aspect,
+ image->vk.compr_flags);
result = add_primary_surface(device, image, plane, plane_format,
- ANV_OFFSET_IMPLICIT, stride,
+ ANV_OFFSET_IMPLICIT, plane_stride,
isl_tiling_flags, isl_usage);
if (result != VK_SUCCESS)
return result;
- if (needs_shadow) {
- result = add_shadow_surface(device, image, plane, plane_format,
- stride, vk_usage);
- if (result != VK_SUCCESS)
- return result;
- }
-
- /* Disable aux if image supports export without modifiers. */
- if (image->vk.external_handle_types != 0 &&
- image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
- continue;
-
result = add_aux_surface_if_supported(device, image, plane, plane_format,
format_list_info,
- ANV_OFFSET_IMPLICIT, stride,
+ ANV_OFFSET_IMPLICIT, plane_stride,
+ ANV_OFFSET_IMPLICIT,
isl_extra_usage_flags);
if (result != VK_SUCCESS)
return result;
@@ -1089,56 +1362,54 @@ add_all_surfaces_explicit_layout(
isl_tiling_flags_t isl_tiling_flags,
isl_surf_usage_flags_t isl_extra_usage_flags)
{
- const struct intel_device_info *devinfo = &device->info;
+ const struct intel_device_info *devinfo = device->info;
const uint32_t mod_plane_count = drm_info->drmFormatModifierPlaneCount;
const bool mod_has_aux =
isl_drm_modifier_has_aux(drm_info->drmFormatModifier);
VkResult result;
- /* About valid usage in the Vulkan spec:
- *
- * Unlike vanilla vkCreateImage, which produces undefined behavior on user
- * error, here the spec requires the implementation to return
- * VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT if the app provides
- * a bad plane layout. However, the spec does require
- * drmFormatModifierPlaneCount to be valid.
- *
- * Most validation of plane layout occurs in add_surface().
- */
-
- /* We support a restricted set of images with modifiers.
- *
- * With aux usage,
- * - Format plane count must be 1.
- * - Memory plane count must be 2.
- * Without aux usage,
- * - Each format plane must map to a distint memory plane.
- *
- * For the other cases, currently there is no way to properly map memory
- * planes to format planes and aux planes due to the lack of defined ABI
- * for external multi-planar images.
+ /* Currently there is no way to properly map memory planes to format planes
+ * and aux planes due to the lack of defined ABI for external multi-planar
+ * images.
*/
if (image->n_planes == 1)
assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
else
assert(!(image->vk.aspects & ~VK_IMAGE_ASPECT_PLANES_BITS_ANV));
- if (mod_has_aux)
- assert(image->n_planes == 1 && mod_plane_count == 2);
- else
+ if (mod_has_aux) {
+ assert(image->n_planes == 1);
+
+ /* About valid usage in the Vulkan spec:
+ *
+ * Unlike vanilla vkCreateImage, which produces undefined behavior on user
+ * error, here the spec requires the implementation to return
+ * VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT if the app provides
+ * a bad plane layout. However, the spec does require
+ * drmFormatModifierPlaneCount to be valid.
+ *
+ * Most validation of plane layout occurs in add_surface().
+ */
+ uint32_t n_mod_planes =
+ isl_drm_modifier_get_plane_count(devinfo,
+ drm_info->drmFormatModifier,
+ image->n_planes);
+ assert(n_mod_planes == mod_plane_count);
+ } else {
assert(image->n_planes == mod_plane_count);
+ }
/* Reject special values in the app-provided plane layouts. */
for (uint32_t i = 0; i < mod_plane_count; ++i) {
if (drm_info->pPlaneLayouts[i].rowPitch == 0) {
- return vk_errorf(device, &device->vk.base,
+ return vk_errorf(device,
VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
"VkImageDrmFormatModifierExplicitCreateInfoEXT::"
"pPlaneLayouts[%u]::rowPitch is 0", i);
}
if (drm_info->pPlaneLayouts[i].offset == ANV_OFFSET_IMPLICIT) {
- return vk_errorf(device, &device->vk.base,
+ return vk_errorf(device,
VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT,
"VkImageDrmFormatModifierExplicitCreateInfoEXT::"
"pPlaneLayouts[%u]::offset is %" PRIu64,
@@ -1162,27 +1433,44 @@ add_all_surfaces_explicit_layout(
if (result != VK_SUCCESS)
return result;
- if (!mod_has_aux) {
- /* Even though the modifier does not support aux, try to create
- * a driver-private aux to improve performance.
+ if (mod_has_aux) {
+ const VkSubresourceLayout flat_ccs_layout = {
+ .offset = ANV_OFFSET_IMPLICIT,
+ };
+
+ const VkSubresourceLayout *aux_layout;
+
+ uint64_t aux_state_offset = ANV_OFFSET_IMPLICIT;
+
+ /* We already asserted on image->n_planes == 1 when mod_has_aux is
+ * true above, so the indexes of aux and clear color are just hard-
+ * coded without ambiguity.
*/
- result = add_aux_surface_if_supported(device, image, plane,
- format_plane,
- format_list_info,
- ANV_OFFSET_IMPLICIT, 0,
- isl_extra_usage_flags);
- if (result != VK_SUCCESS)
- return result;
- } else {
- const VkSubresourceLayout *aux_layout = &drm_info->pPlaneLayouts[1];
+ if (devinfo->has_flat_ccs) {
+ aux_layout = &flat_ccs_layout;
+ if (isl_drm_modifier_get_info(
+ drm_info->drmFormatModifier)->supports_clear_color) {
+ aux_state_offset = drm_info->pPlaneLayouts[1].offset;
+ }
+ } else {
+ aux_layout = &drm_info->pPlaneLayouts[1];
+ if (isl_drm_modifier_get_info(
+ drm_info->drmFormatModifier)->supports_clear_color) {
+ aux_state_offset = drm_info->pPlaneLayouts[2].offset;
+ }
+ }
+
result = add_aux_surface_if_supported(device, image, plane,
format_plane,
format_list_info,
aux_layout->offset,
aux_layout->rowPitch,
+ aux_state_offset,
isl_extra_usage_flags);
if (result != VK_SUCCESS)
return result;
+
+ assert(isl_aux_usage_has_ccs(image->planes[plane].aux_usage));
}
}
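
A plane-index sketch of the aux/clear-color selection above; the modifier name is only an example for orientation, the code keys solely off has_flat_ccs and supports_clear_color:

   /* Legacy CCS modifier with clear color
    * (e.g. I915_FORMAT_MOD_Y_TILED_GEN12_RC_CCS_CC):
    *    pPlaneLayouts[0] = main surface
    *    pPlaneLayouts[1] = CCS           -> aux_layout
    *    pPlaneLayouts[2] = clear color   -> aux_state_offset
    *
    * Flat-CCS modifier with clear color:
    *    pPlaneLayouts[0] = main surface
    *    pPlaneLayouts[1] = clear color   -> aux_state_offset
    *    (no CCS plane; the aux offset stays ANV_OFFSET_IMPLICIT) */
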
@@ -1248,33 +1536,107 @@ alloc_private_binding(struct anv_device *device,
return VK_SUCCESS;
}
- return anv_device_alloc_bo(device, "image-binding-private",
- binding->memory_range.size, 0, 0,
- &binding->address.bo);
+ VkResult result = anv_device_alloc_bo(device, "image-binding-private",
+ binding->memory_range.size, 0, 0,
+ &binding->address.bo);
+ if (result == VK_SUCCESS) {
+ pthread_mutex_lock(&device->mutex);
+ list_addtail(&image->link, &device->image_private_objects);
+ pthread_mutex_unlock(&device->mutex);
+ }
+
+ return result;
+}
+
+static void
+anv_image_finish_sparse_bindings(struct anv_image *image)
+{
+ struct anv_device *device =
+ container_of(image->vk.base.device, struct anv_device, vk);
+
+ assert(anv_image_is_sparse(image));
+
+ for (int i = 0; i < ANV_IMAGE_MEMORY_BINDING_END; i++) {
+ struct anv_image_binding *b = &image->bindings[i];
+
+ if (b->sparse_data.size != 0) {
+ assert(b->memory_range.size == b->sparse_data.size);
+ assert(b->address.offset == b->sparse_data.address);
+ anv_free_sparse_bindings(device, &b->sparse_data);
+ }
+ }
+}
+
+static VkResult MUST_CHECK
+anv_image_init_sparse_bindings(struct anv_image *image,
+ const struct anv_image_create_info *create_info)
+{
+ struct anv_device *device =
+ container_of(image->vk.base.device, struct anv_device, vk);
+ VkResult result;
+
+ assert(anv_image_is_sparse(image));
+
+ enum anv_bo_alloc_flags alloc_flags = 0;
+ uint64_t explicit_address = 0;
+ if (image->vk.create_flags & VK_IMAGE_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
+ alloc_flags |= ANV_BO_ALLOC_FIXED_ADDRESS;
+
+ const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
+ vk_find_struct_const(create_info->vk_info->pNext,
+ OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
+ if (opaque_info)
+ explicit_address = *((const uint64_t *)opaque_info->opaqueCaptureDescriptorData);
+ }
+
+ for (int i = 0; i < ANV_IMAGE_MEMORY_BINDING_END; i++) {
+ struct anv_image_binding *b = &image->bindings[i];
+
+ if (b->memory_range.size != 0) {
+ assert(b->sparse_data.size == 0);
+
+ /* From the spec, Custom Sparse Image Block Shapes section:
+ * "... the size in bytes of the custom sparse image block shape
+ * will be reported in VkMemoryRequirements::alignment."
+ *
+ * ISL should have set this for us, so just assert it here.
+ */
+ assert(b->memory_range.alignment == ANV_SPARSE_BLOCK_SIZE);
+ assert(b->memory_range.size % ANV_SPARSE_BLOCK_SIZE == 0);
+
+ result = anv_init_sparse_bindings(device,
+ b->memory_range.size,
+ &b->sparse_data,
+ alloc_flags,
+ explicit_address,
+ &b->address);
+ if (result != VK_SUCCESS) {
+ anv_image_finish_sparse_bindings(image);
+ return result;
+ }
+ }
+ }
+
+ return VK_SUCCESS;
}
VkResult
-anv_image_create(VkDevice _device,
- const struct anv_image_create_info *create_info,
- const VkAllocationCallbacks* alloc,
- VkImage *pImage)
+anv_image_init(struct anv_device *device, struct anv_image *image,
+ const struct anv_image_create_info *create_info)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
const VkImageCreateInfo *pCreateInfo = create_info->vk_info;
const struct VkImageDrmFormatModifierExplicitCreateInfoEXT *mod_explicit_info = NULL;
const struct isl_drm_modifier_info *isl_mod_info = NULL;
- struct anv_image *image = NULL;
VkResult r;
- image = vk_image_create(&device->vk, pCreateInfo, alloc, sizeof(*image));
- if (image == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ vk_image_init(&device->vk, &image->vk, pCreateInfo);
image->vk.usage = anv_image_create_usage(pCreateInfo, image->vk.usage);
image->vk.stencil_usage =
anv_image_create_usage(pCreateInfo, image->vk.stencil_usage);
if (pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
+ assert(!image->vk.wsi_legacy_scanout);
mod_explicit_info =
vk_find_struct_const(pCreateInfo->pNext,
IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT);
@@ -1304,12 +1666,17 @@ anv_image_create(VkDevice _device,
if (image->vk.external_handle_types &
VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID) {
image->from_ahb = true;
- *pImage = anv_image_to_handle(image);
+#if DETECT_OS_ANDROID
+ image->vk.ahb_format = anv_ahb_format_for_vk_format(image->vk.format);
+#endif
return VK_SUCCESS;
}
image->n_planes = anv_get_format_planes(image->vk.format);
+ image->from_wsi =
+ vk_find_struct_const(pCreateInfo->pNext, WSI_IMAGE_CREATE_INFO_MESA) != NULL;
+
/* The Vulkan 1.2.165 glossary says:
*
* A disjoint image consists of multiple disjoint planes, and is created
@@ -1318,30 +1685,88 @@ anv_image_create(VkDevice _device,
image->disjoint = image->n_planes > 1 &&
(pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT);
+ isl_surf_usage_flags_t isl_extra_usage_flags = create_info->isl_extra_usage_flags;
+ if (anv_is_format_emulated(device->physical, pCreateInfo->format)) {
+ assert(image->n_planes == 1 &&
+ vk_format_is_compressed(image->vk.format));
+ assert(!(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT));
+
+ image->emu_plane_format =
+ anv_get_emulation_format(device->physical, image->vk.format);
+
+ /* For fetching the raw compressed data and storing the decompressed
+ * data
+ */
+ image->vk.create_flags |=
+ VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT |
+ VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT;
+ if (image->vk.image_type == VK_IMAGE_TYPE_3D)
+ image->vk.create_flags |= VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT;
+ image->vk.usage |=
+ VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT;
+
+ /* TODO: enable compression on emulation plane */
+ isl_extra_usage_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
+ }
+
+ /* Disable aux if image supports export without modifiers. */
+ if (image->vk.external_handle_types != 0 &&
+ image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
+ isl_extra_usage_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
+
const isl_tiling_flags_t isl_tiling_flags =
- choose_isl_tiling_flags(&device->info, create_info, isl_mod_info,
+ choose_isl_tiling_flags(device->info, create_info, isl_mod_info,
image->vk.wsi_legacy_scanout);
- const VkImageFormatListCreateInfoKHR *fmt_list =
+ const VkImageFormatListCreateInfo *fmt_list =
vk_find_struct_const(pCreateInfo->pNext,
- IMAGE_FORMAT_LIST_CREATE_INFO_KHR);
+ IMAGE_FORMAT_LIST_CREATE_INFO);
if (mod_explicit_info) {
r = add_all_surfaces_explicit_layout(device, image, fmt_list,
mod_explicit_info, isl_tiling_flags,
- create_info->isl_extra_usage_flags);
+ isl_extra_usage_flags);
} else {
- r = add_all_surfaces_implicit_layout(device, image, fmt_list, 0,
+ r = add_all_surfaces_implicit_layout(device, image, fmt_list, create_info->stride,
isl_tiling_flags,
- create_info);
+ isl_extra_usage_flags);
}
if (r != VK_SUCCESS)
goto fail;
- r = alloc_private_binding(device, image, pCreateInfo);
- if (r != VK_SUCCESS)
- goto fail;
+ if (image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ const struct intel_device_info *devinfo = device->info;
+ const uint32_t plane = image->n_planes;
+ const struct anv_format_plane plane_format = anv_get_format_plane(
+ devinfo, image->emu_plane_format, 0, image->vk.tiling);
+
+ isl_surf_usage_flags_t isl_usage = anv_image_choose_isl_surf_usage(
+ device->physical, image->vk.create_flags, image->vk.usage,
+ isl_extra_usage_flags, VK_IMAGE_ASPECT_COLOR_BIT,
+ image->vk.compr_flags);
+
+ r = add_primary_surface(device, image, plane, plane_format,
+ ANV_OFFSET_IMPLICIT, 0,
+ isl_tiling_flags, isl_usage);
+ if (r != VK_SUCCESS)
+ goto fail;
+ }
+
+ const VkVideoProfileListInfoKHR *video_profile =
+ vk_find_struct_const(pCreateInfo->pNext,
+ VIDEO_PROFILE_LIST_INFO_KHR);
+ if (video_profile) {
+ r = add_video_buffers(device, image, video_profile);
+ if (r != VK_SUCCESS)
+ goto fail;
+ }
+
+ if (!create_info->no_private_binding_alloc) {
+ r = alloc_private_binding(device, image, pCreateInfo);
+ if (r != VK_SUCCESS)
+ goto fail;
+ }
check_memory_bindings(device, image);
@@ -1349,95 +1774,135 @@ anv_image_create(VkDevice _device,
if (r != VK_SUCCESS)
goto fail;
- *pImage = anv_image_to_handle(image);
+ /* Once we have all the bindings, determine whether we can do non 0 fast
+ * clears for each plane.
+ */
+ for (uint32_t p = 0; p < image->n_planes; p++) {
+ image->planes[p].can_non_zero_fast_clear =
+ can_fast_clear_with_non_zero_color(device->info, image, p, fmt_list);
+ }
+
+ if (anv_image_is_sparse(image)) {
+ r = anv_image_init_sparse_bindings(image, create_info);
+ if (r != VK_SUCCESS)
+ goto fail;
+ }
return VK_SUCCESS;
fail:
- vk_image_destroy(&device->vk, alloc, &image->vk);
+ vk_image_finish(&image->vk);
return r;
}
-static struct anv_image *
-anv_swapchain_get_image(VkSwapchainKHR swapchain,
- uint32_t index)
+void
+anv_image_finish(struct anv_image *image)
{
- uint32_t n_images = index + 1;
- VkImage *images = malloc(sizeof(*images) * n_images);
- VkResult result = wsi_common_get_images(swapchain, &n_images, images);
+ struct anv_device *device =
+ container_of(image->vk.base.device, struct anv_device, vk);
- if (result != VK_SUCCESS && result != VK_INCOMPLETE) {
- free(images);
- return NULL;
+ if (anv_image_is_sparse(image))
+ anv_image_finish_sparse_bindings(image);
+
+ /* Unmap a CCS so that if the bound region of the image is rebound to
+ * another image, the AUX tables will be cleared to allow for a new
+ * mapping.
+ */
+ for (int p = 0; p < image->n_planes; ++p) {
+ if (image->planes[p].aux_tt.mapped) {
+ intel_aux_map_del_mapping(device->aux_map_ctx,
+ image->planes[p].aux_tt.addr,
+ image->planes[p].aux_tt.size);
+ }
+ }
+
+ if (image->from_gralloc) {
+ assert(!image->disjoint);
+ assert(image->n_planes == 1);
+ assert(image->planes[0].primary_surface.memory_range.binding ==
+ ANV_IMAGE_MEMORY_BINDING_MAIN);
+ assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo != NULL);
+ anv_device_release_bo(device, image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo);
}
- ANV_FROM_HANDLE(anv_image, image, images[index]);
- free(images);
+ struct anv_bo *private_bo = image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
+ if (private_bo) {
+ pthread_mutex_lock(&device->mutex);
+ list_del(&image->link);
+ pthread_mutex_unlock(&device->mutex);
+ anv_device_release_bo(device, private_bo);
+ }
- return image;
+ vk_image_finish(&image->vk);
}
-static VkResult
-anv_image_from_swapchain(VkDevice device,
- const VkImageCreateInfo *pCreateInfo,
- const VkImageSwapchainCreateInfoKHR *swapchain_info,
- const VkAllocationCallbacks *pAllocator,
- VkImage *pImage)
+static struct anv_image *
+anv_swapchain_get_image(VkSwapchainKHR swapchain,
+ uint32_t index)
{
- struct anv_image *swapchain_image = anv_swapchain_get_image(swapchain_info->swapchain, 0);
- assert(swapchain_image);
-
- VkImageCreateInfo local_create_info = *pCreateInfo;
- local_create_info.pNext = NULL;
+ VkImage image = wsi_common_get_image(swapchain, index);
+ return anv_image_from_handle(image);
+}
- /* Added by wsi code. */
- local_create_info.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+static VkResult
+anv_image_init_from_create_info(struct anv_device *device,
+ struct anv_image *image,
+ const VkImageCreateInfo *pCreateInfo,
+ bool no_private_binding_alloc)
+{
+ if (pCreateInfo->flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) {
+ VkResult result =
+ anv_sparse_image_check_support(device->physical,
+ pCreateInfo->flags,
+ pCreateInfo->tiling,
+ pCreateInfo->samples,
+ pCreateInfo->imageType,
+ pCreateInfo->format);
+ if (result != VK_SUCCESS)
+ return result;
+ }
- /* The spec requires TILING_OPTIMAL as input, but the swapchain image may
- * privately use a different tiling. See spec anchor
- * #swapchain-wsi-image-create-info .
- */
- assert(local_create_info.tiling == VK_IMAGE_TILING_OPTIMAL);
- local_create_info.tiling = swapchain_image->vk.tiling;
+ const VkNativeBufferANDROID *gralloc_info =
+ vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID);
+ if (gralloc_info)
+ return anv_image_init_from_gralloc(device, image, pCreateInfo,
+ gralloc_info);
- VkImageDrmFormatModifierListCreateInfoEXT local_modifier_info = {
- .sType = VK_STRUCTURE_TYPE_IMAGE_DRM_FORMAT_MODIFIER_LIST_CREATE_INFO_EXT,
- .drmFormatModifierCount = 1,
- .pDrmFormatModifiers = &swapchain_image->vk.drm_format_mod,
+ struct anv_image_create_info create_info = {
+ .vk_info = pCreateInfo,
+ .no_private_binding_alloc = no_private_binding_alloc,
};
- if (swapchain_image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID)
- __vk_append_struct(&local_create_info, &local_modifier_info);
-
- assert(swapchain_image->vk.image_type == local_create_info.imageType);
- assert(swapchain_image->vk.format == local_create_info.format);
- assert(swapchain_image->vk.extent.width == local_create_info.extent.width);
- assert(swapchain_image->vk.extent.height == local_create_info.extent.height);
- assert(swapchain_image->vk.extent.depth == local_create_info.extent.depth);
- assert(swapchain_image->vk.array_layers == local_create_info.arrayLayers);
- assert(swapchain_image->vk.samples == local_create_info.samples);
- assert(swapchain_image->vk.tiling == local_create_info.tiling);
- assert(swapchain_image->vk.usage == local_create_info.usage);
+ /* For dmabuf imports, configure the primary surface without support for
+ * compression if the modifier doesn't specify it. This helps to create
+ * VkImages with memory requirements that are compatible with the buffers
+ * apps provide.
+ */
+ const struct VkImageDrmFormatModifierExplicitCreateInfoEXT *mod_explicit_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ IMAGE_DRM_FORMAT_MODIFIER_EXPLICIT_CREATE_INFO_EXT);
+ if (mod_explicit_info &&
+ !isl_drm_modifier_has_aux(mod_explicit_info->drmFormatModifier))
+ create_info.isl_extra_usage_flags |= ISL_SURF_USAGE_DISABLE_AUX_BIT;
- return anv_image_create(device,
- &(struct anv_image_create_info) {
- .vk_info = &local_create_info,
- },
- pAllocator,
- pImage);
+ return anv_image_init(device, image, &create_info);
}
-VkResult
-anv_CreateImage(VkDevice device,
- const VkImageCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkImage *pImage)
+VkResult anv_CreateImage(
+ VkDevice _device,
+ const VkImageCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkImage* pImage)
{
- const VkNativeBufferANDROID *gralloc_info =
- vk_find_struct_const(pCreateInfo->pNext, NATIVE_BUFFER_ANDROID);
- if (gralloc_info)
- return anv_image_from_gralloc(device, pCreateInfo, gralloc_info,
- pAllocator, pImage);
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE) &&
+ pCreateInfo->flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT |
+ VK_IMAGE_CREATE_SPARSE_ALIASED_BIT))
+ fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
+ __LINE__, pCreateInfo->flags);
#ifndef VK_USE_PLATFORM_ANDROID_KHR
/* Ignore swapchain creation info on Android. Since we don't have an
@@ -1446,17 +1911,33 @@ anv_CreateImage(VkDevice device,
*/
const VkImageSwapchainCreateInfoKHR *swapchain_info =
vk_find_struct_const(pCreateInfo->pNext, IMAGE_SWAPCHAIN_CREATE_INFO_KHR);
- if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE)
- return anv_image_from_swapchain(device, pCreateInfo, swapchain_info,
- pAllocator, pImage);
+ if (swapchain_info && swapchain_info->swapchain != VK_NULL_HANDLE) {
+ return wsi_common_create_swapchain_image(&device->physical->wsi_device,
+ pCreateInfo,
+ swapchain_info->swapchain,
+ pImage);
+ }
#endif
- return anv_image_create(device,
- &(struct anv_image_create_info) {
- .vk_info = pCreateInfo,
- },
- pAllocator,
- pImage);
+ struct anv_image *image =
+ vk_object_zalloc(&device->vk, pAllocator, sizeof(*image),
+ VK_OBJECT_TYPE_IMAGE);
+ if (!image)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ VkResult result = anv_image_init_from_create_info(device, image,
+ pCreateInfo,
+ false);
+ if (result != VK_SUCCESS) {
+ vk_object_free(&device->vk, pAllocator, image);
+ return result;
+ }
+
+ ANV_RMV(image_create, device, false, image);
+
+ *pImage = anv_image_to_handle(image);
+
+ return result;
}
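
/* For reference, a minimal application-side vkCreateImage call that ends up
 * in anv_CreateImage above. Illustrative sketch only, not driver code; it
 * assumes <vulkan/vulkan.h> and a valid VkDevice.
 */
static VkImage
create_basic_2d_image(VkDevice dev, VkFormat format, uint32_t w, uint32_t h)
{
   const VkImageCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = VK_IMAGE_TYPE_2D,
      .format = format,
      .extent = { .width = w, .height = h, .depth = 1 },
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = VK_SAMPLE_COUNT_1_BIT,
      .tiling = VK_IMAGE_TILING_OPTIMAL,
      .usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
   };
   VkImage img = VK_NULL_HANDLE;
   if (vkCreateImage(dev, &info, NULL, &img) != VK_SUCCESS)
      return VK_NULL_HANDLE;
   return img;
}
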
void
@@ -1469,20 +1950,12 @@ anv_DestroyImage(VkDevice _device, VkImage _image,
if (!image)
return;
- if (image->from_gralloc) {
- assert(!image->disjoint);
- assert(image->n_planes == 1);
- assert(image->planes[0].primary_surface.memory_range.binding ==
- ANV_IMAGE_MEMORY_BINDING_MAIN);
- assert(image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo != NULL);
- anv_device_release_bo(device, image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address.bo);
- }
+ ANV_RMV(image_destroy, device, image);
- struct anv_bo *private_bo = image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
- if (private_bo)
- anv_device_release_bo(device, private_bo);
+ assert(&device->vk == image->vk.base.device);
+ anv_image_finish(image);
- vk_image_destroy(&device->vk, pAllocator, &image->vk);
+ vk_free2(&device->vk.alloc, pAllocator, image);
}
/* We are binding AHardwareBuffer. Get a description, resolve the
@@ -1493,54 +1966,21 @@ resolve_ahw_image(struct anv_device *device,
struct anv_image *image,
struct anv_device_memory *mem)
{
-#if defined(ANDROID) && ANDROID_API_LEVEL >= 26
- assert(mem->ahw);
+#if DETECT_OS_ANDROID && ANDROID_API_LEVEL >= 26
+ assert(mem->vk.ahardware_buffer);
AHardwareBuffer_Desc desc;
- AHardwareBuffer_describe(mem->ahw, &desc);
+ AHardwareBuffer_describe(mem->vk.ahardware_buffer, &desc);
VkResult result;
/* Check tiling. */
- int i915_tiling = anv_gem_get_tiling(device, mem->bo->gem_handle);
- VkImageTiling vk_tiling;
- isl_tiling_flags_t isl_tiling_flags = 0;
-
- switch (i915_tiling) {
- case I915_TILING_NONE:
- vk_tiling = VK_IMAGE_TILING_LINEAR;
- isl_tiling_flags = ISL_TILING_LINEAR_BIT;
- break;
- case I915_TILING_X:
- vk_tiling = VK_IMAGE_TILING_OPTIMAL;
- isl_tiling_flags = ISL_TILING_X_BIT;
- break;
- case I915_TILING_Y:
- vk_tiling = VK_IMAGE_TILING_OPTIMAL;
- isl_tiling_flags = ISL_TILING_Y0_BIT;
- break;
- case -1:
- default:
- unreachable("Invalid tiling flags.");
- }
-
- assert(vk_tiling == VK_IMAGE_TILING_LINEAR ||
- vk_tiling == VK_IMAGE_TILING_OPTIMAL);
+ enum isl_tiling tiling;
+ result = anv_device_get_bo_tiling(device, mem->bo, &tiling);
+ assert(result == VK_SUCCESS);
+ isl_tiling_flags_t isl_tiling_flags = (1u << tiling);
/* Check format. */
VkFormat vk_format = vk_format_from_android(desc.format, desc.usage);
- enum isl_format isl_fmt = anv_get_isl_format(&device->info,
- vk_format,
- VK_IMAGE_ASPECT_COLOR_BIT,
- vk_tiling);
- assert(isl_fmt != ISL_FORMAT_UNSUPPORTED);
-
- /* Handle RGB(X)->RGBA fallback. */
- switch (desc.format) {
- case AHARDWAREBUFFER_FORMAT_R8G8B8_UNORM:
- case AHARDWAREBUFFER_FORMAT_R8G8B8X8_UNORM:
- if (isl_format_is_rgb(isl_fmt))
- isl_fmt = isl_format_rgb_to_rgba(isl_fmt);
- break;
- }
+ assert(vk_format != VK_FORMAT_UNDEFINED);
/* Now we are able to fill anv_image fields properly and create
* isl_surface for it.
@@ -1548,70 +1988,41 @@ resolve_ahw_image(struct anv_device *device,
vk_image_set_format(&image->vk, vk_format);
image->n_planes = anv_get_format_planes(image->vk.format);
- uint32_t stride = desc.stride *
- (isl_format_get_layout(isl_fmt)->bpb / 8);
-
- struct anv_image_create_info create_info = {
- .isl_extra_usage_flags = ISL_SURF_USAGE_DISABLE_AUX_BIT,
- };
-
- result = add_all_surfaces_implicit_layout(device, image, NULL, stride,
+ result = add_all_surfaces_implicit_layout(device, image, NULL, desc.stride,
isl_tiling_flags,
- &create_info);
+ ISL_SURF_USAGE_DISABLE_AUX_BIT);
assert(result == VK_SUCCESS);
#endif
}
-void anv_GetImageMemoryRequirements2(
- VkDevice _device,
- const VkImageMemoryRequirementsInfo2* pInfo,
- VkMemoryRequirements2* pMemoryRequirements)
+void
+anv_image_get_memory_requirements(struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags aspects,
+ VkMemoryRequirements2 *pMemoryRequirements)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_image, image, pInfo->image);
-
- const VkImagePlaneMemoryRequirementsInfo *plane_reqs = NULL;
-
/* The Vulkan spec (git aaed022) says:
*
* memoryTypeBits is a bitfield and contains one bit set for every
* supported memory type for the resource. The bit `1<<i` is set if and
* only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
* structure for the physical device is supported.
- *
- * All types are currently supported for images.
*/
- uint32_t memory_types = (1ull << device->physical->memory.type_count) - 1;
-
- vk_foreach_struct_const(ext, pInfo->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: {
- assert(image->disjoint);
- plane_reqs = (const VkImagePlaneMemoryRequirementsInfo *) ext;
- const struct anv_image_binding *binding =
- image_aspect_to_binding(image, plane_reqs->planeAspect);
-
- pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
- .size = binding->memory_range.size,
- .alignment = binding->memory_range.alignment,
- .memoryTypeBits = memory_types,
- };
- break;
- }
-
- default:
- anv_debug_ignored_stype(ext->sType);
- break;
- }
- }
+ uint32_t memory_types =
+ (image->vk.create_flags & VK_IMAGE_CREATE_PROTECTED_BIT) ?
+ device->physical->memory.protected_mem_types :
+ device->physical->memory.default_buffer_mem_types;
vk_foreach_struct(ext, pMemoryRequirements->pNext) {
switch (ext->sType) {
case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
VkMemoryDedicatedRequirements *requirements = (void *)ext;
- if (image->vk.wsi_legacy_scanout || image->from_ahb) {
- /* If we need to set the tiling for external consumers, we need a
- * dedicated allocation.
+ if (image->vk.wsi_legacy_scanout ||
+ image->from_ahb ||
+ (isl_drm_modifier_has_aux(image->vk.drm_format_mod) &&
+ anv_image_uses_aux_map(device, image))) {
+ /* If we need to set the tiling for external consumers or the
+ * modifier involves AUX tables, we need a dedicated allocation.
*
* See also anv_AllocateMemory.
*/
@@ -1640,173 +2051,537 @@ void anv_GetImageMemoryRequirements2(
* and only if the image is disjoint (that is, multi-planar format and
* VK_IMAGE_CREATE_DISJOINT_BIT).
*/
- assert(image->disjoint == (plane_reqs != NULL));
+ const struct anv_image_binding *binding;
+ if (image->disjoint) {
+ assert(util_bitcount(aspects) == 1);
+ assert(aspects & image->vk.aspects);
+ binding = anv_image_aspect_to_binding(image, aspects);
+ } else {
+ assert(aspects == image->vk.aspects);
+ binding = &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN];
+ }
- if (!image->disjoint) {
- const struct anv_image_binding *binding =
- &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN];
+ pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
+ .size = binding->memory_range.size,
+ .alignment = binding->memory_range.alignment,
+ .memoryTypeBits = memory_types,
+ };
+}
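
/* For context, the memoryTypeBits value filled in above is consumed by
 * applications with the usual "find a compatible memory type" loop. This is
 * an illustrative caller-side sketch, not anv code; the helper name and the
 * <vulkan/vulkan.h> dependency are assumptions.
 */
static uint32_t
find_memory_type_index(VkPhysicalDevice pdev, uint32_t memory_type_bits,
                       VkMemoryPropertyFlags wanted)
{
   VkPhysicalDeviceMemoryProperties props;
   vkGetPhysicalDeviceMemoryProperties(pdev, &props);
   for (uint32_t i = 0; i < props.memoryTypeCount; i++) {
      /* Type i is acceptable if the resource allows it and it carries all
       * the requested property flags.
       */
      if ((memory_type_bits & (1u << i)) &&
          (props.memoryTypes[i].propertyFlags & wanted) == wanted)
         return i;
   }
   return UINT32_MAX; /* no compatible memory type */
}
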
- pMemoryRequirements->memoryRequirements = (VkMemoryRequirements) {
- .size = binding->memory_range.size,
- .alignment = binding->memory_range.alignment,
- .memoryTypeBits = memory_types,
- };
+void anv_GetImageMemoryRequirements2(
+ VkDevice _device,
+ const VkImageMemoryRequirementsInfo2* pInfo,
+ VkMemoryRequirements2* pMemoryRequirements)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_image, image, pInfo->image);
+
+ VkImageAspectFlags aspects = image->vk.aspects;
+
+ vk_foreach_struct_const(ext, pInfo->pNext) {
+ switch (ext->sType) {
+ case VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO: {
+ assert(image->disjoint);
+ const VkImagePlaneMemoryRequirementsInfo *plane_reqs =
+ (const VkImagePlaneMemoryRequirementsInfo *) ext;
+ aspects = plane_reqs->planeAspect;
+ break;
+ }
+
+ default:
+ anv_debug_ignored_stype(ext->sType);
+ break;
+ }
}
+
+ anv_image_get_memory_requirements(device, image, aspects,
+ pMemoryRequirements);
}
-void anv_GetImageSparseMemoryRequirements(
- VkDevice device,
- VkImage image,
- uint32_t* pSparseMemoryRequirementCount,
- VkSparseImageMemoryRequirements* pSparseMemoryRequirements)
+void anv_GetDeviceImageMemoryRequirements(
+ VkDevice _device,
+ const VkDeviceImageMemoryRequirements* pInfo,
+ VkMemoryRequirements2* pMemoryRequirements)
{
- *pSparseMemoryRequirementCount = 0;
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_image image = { 0 };
+
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE) &&
+ pInfo->pCreateInfo->flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT |
+ VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT |
+ VK_IMAGE_CREATE_SPARSE_ALIASED_BIT))
+ fprintf(stderr, "=== %s %s:%d flags:0x%08x\n", __func__, __FILE__,
+ __LINE__, pInfo->pCreateInfo->flags);
+
+ ASSERTED VkResult result =
+ anv_image_init_from_create_info(device, &image, pInfo->pCreateInfo, true);
+ assert(result == VK_SUCCESS);
+
+ VkImageAspectFlags aspects =
+ image.disjoint ? pInfo->planeAspect : image.vk.aspects;
+
+ anv_image_get_memory_requirements(device, &image, aspects,
+ pMemoryRequirements);
+ anv_image_finish(&image);
+}
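
/* Caller-side sketch of the entry point above: querying memory requirements
 * straight from a VkImageCreateInfo, without ever creating a VkImage.
 * Illustrative only; assumes the Vulkan 1.3 core names and a valid VkDevice.
 */
static VkDeviceSize
query_image_size_without_creating(VkDevice dev,
                                  const VkImageCreateInfo *create_info)
{
   const VkDeviceImageMemoryRequirements info = {
      .sType = VK_STRUCTURE_TYPE_DEVICE_IMAGE_MEMORY_REQUIREMENTS,
      .pCreateInfo = create_info,
      /* planeAspect is ignored for non-disjoint, non-modifier images */
   };
   VkMemoryRequirements2 reqs = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
   };
   vkGetDeviceImageMemoryRequirements(dev, &info, &reqs);
   return reqs.memoryRequirements.size;
}
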
+
+static void
+anv_image_get_sparse_memory_requirements(
+ struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags aspects,
+ uint32_t *pSparseMemoryRequirementCount,
+ VkSparseImageMemoryRequirements2 *pSparseMemoryRequirements)
+{
+ VK_OUTARRAY_MAKE_TYPED(VkSparseImageMemoryRequirements2, reqs,
+ pSparseMemoryRequirements,
+ pSparseMemoryRequirementCount);
+
+ /* From the spec:
+ * "The sparse image must have been created using the
+ * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT flag to retrieve valid sparse
+ * image memory requirements."
+ */
+ if (!(image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))
+ return;
+
+ VkSparseImageMemoryRequirements ds_mem_reqs = {};
+ VkSparseImageMemoryRequirements2 *ds_reqs_ptr = NULL;
+
+ u_foreach_bit(b, aspects) {
+ VkImageAspectFlagBits aspect = 1 << b;
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+
+ VkSparseImageFormatProperties format_props =
+ anv_sparse_calc_image_format_properties(device->physical, aspect,
+ image->vk.image_type, surf);
+
+ uint32_t miptail_first_lod;
+ VkDeviceSize miptail_size, miptail_offset, miptail_stride;
+ anv_sparse_calc_miptail_properties(device, image, aspect,
+ &miptail_first_lod, &miptail_size,
+ &miptail_offset, &miptail_stride);
+
+ VkSparseImageMemoryRequirements mem_reqs = {
+ .formatProperties = format_props,
+ .imageMipTailFirstLod = miptail_first_lod,
+ .imageMipTailSize = miptail_size,
+ .imageMipTailOffset = miptail_offset,
+ .imageMipTailStride = miptail_stride,
+ };
+
+ /* If the depth and stencil requirements are identical, unify them if possible. */
+ if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ if (!ds_reqs_ptr) {
+ ds_mem_reqs = mem_reqs;
+ } else if (ds_mem_reqs.formatProperties.imageGranularity.width ==
+ mem_reqs.formatProperties.imageGranularity.width &&
+ ds_mem_reqs.formatProperties.imageGranularity.height ==
+ mem_reqs.formatProperties.imageGranularity.height &&
+ ds_mem_reqs.formatProperties.imageGranularity.depth ==
+ mem_reqs.formatProperties.imageGranularity.depth &&
+ ds_mem_reqs.imageMipTailFirstLod ==
+ mem_reqs.imageMipTailFirstLod &&
+ ds_mem_reqs.imageMipTailSize ==
+ mem_reqs.imageMipTailSize &&
+ ds_mem_reqs.imageMipTailOffset ==
+ mem_reqs.imageMipTailOffset &&
+ ds_mem_reqs.imageMipTailStride ==
+ mem_reqs.imageMipTailStride) {
+ ds_reqs_ptr->memoryRequirements.formatProperties.aspectMask |=
+ aspect;
+ continue;
+ }
+ }
+
+ vk_outarray_append_typed(VkSparseImageMemoryRequirements2, &reqs, r) {
+ r->memoryRequirements = mem_reqs;
+ if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT))
+ ds_reqs_ptr = r;
+ }
+ }
}
void anv_GetImageSparseMemoryRequirements2(
- VkDevice device,
+ VkDevice _device,
const VkImageSparseMemoryRequirementsInfo2* pInfo,
uint32_t* pSparseMemoryRequirementCount,
VkSparseImageMemoryRequirements2* pSparseMemoryRequirements)
{
- *pSparseMemoryRequirementCount = 0;
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_image, image, pInfo->image);
+
+ if (!anv_sparse_residency_is_enabled(device)) {
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE))
+ fprintf(stderr, "=== [%s:%d] [%s]\n", __FILE__, __LINE__, __func__);
+
+ *pSparseMemoryRequirementCount = 0;
+ return;
+ }
+
+ anv_image_get_sparse_memory_requirements(device, image, image->vk.aspects,
+ pSparseMemoryRequirementCount,
+ pSparseMemoryRequirements);
}
-VkResult anv_BindImageMemory2(
+void anv_GetDeviceImageSparseMemoryRequirements(
VkDevice _device,
- uint32_t bindInfoCount,
- const VkBindImageMemoryInfo* pBindInfos)
+ const VkDeviceImageMemoryRequirements* pInfo,
+ uint32_t* pSparseMemoryRequirementCount,
+ VkSparseImageMemoryRequirements2* pSparseMemoryRequirements)
{
ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_image image = { 0 };
- for (uint32_t i = 0; i < bindInfoCount; i++) {
- const VkBindImageMemoryInfo *bind_info = &pBindInfos[i];
- ANV_FROM_HANDLE(anv_device_memory, mem, bind_info->memory);
- ANV_FROM_HANDLE(anv_image, image, bind_info->image);
- bool did_bind = false;
-
- /* Resolve will alter the image's aspects, do this first. */
- if (mem && mem->ahw)
- resolve_ahw_image(device, image, mem);
-
- vk_foreach_struct_const(s, bind_info->pNext) {
- switch (s->sType) {
- case VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO: {
- const VkBindImagePlaneMemoryInfo *plane_info =
- (const VkBindImagePlaneMemoryInfo *) s;
-
- /* Workaround for possible spec bug.
- *
- * Unlike VkImagePlaneMemoryRequirementsInfo, which requires that
- * the image be disjoint (that is, multi-planar format and
- * VK_IMAGE_CREATE_DISJOINT_BIT), VkBindImagePlaneMemoryInfo allows
- * the image to be non-disjoint and requires only that the image
- * have the DISJOINT flag. In this case, regardless of the value of
- * VkImagePlaneMemoryRequirementsInfo::planeAspect, the behavior is
- * the same as if VkImagePlaneMemoryRequirementsInfo were omitted.
- */
- if (!image->disjoint)
- break;
+ if (!anv_sparse_residency_is_enabled(device)) {
+ if ((device->physical->sparse_type == ANV_SPARSE_TYPE_NOT_SUPPORTED) &&
+ INTEL_DEBUG(DEBUG_SPARSE))
+ fprintf(stderr, "=== [%s:%d] [%s]\n", __FILE__, __LINE__, __func__);
- struct anv_image_binding *binding =
- image_aspect_to_binding(image, plane_info->planeAspect);
+ *pSparseMemoryRequirementCount = 0;
+ return;
+ }
- binding->address = (struct anv_address) {
- .bo = mem->bo,
- .offset = bind_info->memoryOffset,
- };
+ /* This function is similar to anv_GetDeviceImageMemoryRequirements in
+ * that it actually creates an image, gets its properties and then
+ * destroys the image.
+ *
+ * We could one day refactor things to allow us to gather the properties
+ * without having to actually create the image, maybe by reworking ISL to
+ * separate creation from parameter computing.
+ */
+ VkResult result =
+ anv_image_init_from_create_info(device, &image, pInfo->pCreateInfo,
+ true /* no_private_binding_alloc */);
+ if (result != VK_SUCCESS) {
+ *pSparseMemoryRequirementCount = 0;
+ return;
+ }
- did_bind = true;
- break;
+ /* The spec says:
+ * "planeAspect is a VkImageAspectFlagBits value specifying the aspect
+ * corresponding to the image plane to query. This parameter is ignored
+ * unless pCreateInfo::tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT,
+ * or pCreateInfo::flags has VK_IMAGE_CREATE_DISJOINT_BIT set."
+ */
+ VkImageAspectFlags aspects =
+ (pInfo->pCreateInfo->flags & VK_IMAGE_CREATE_DISJOINT_BIT) ||
+ (pInfo->pCreateInfo->tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT)
+ ? pInfo->planeAspect : image.vk.aspects;
+
+ anv_image_get_sparse_memory_requirements(device, &image, aspects,
+ pSparseMemoryRequirementCount,
+ pSparseMemoryRequirements);
+
+ anv_image_finish(&image);
+}
+
+static bool
+anv_image_map_aux_tt(struct anv_device *device,
+ struct anv_image *image, uint32_t plane)
+{
+ const struct anv_address main_addr = anv_image_address(
+ image, &image->planes[plane].primary_surface.memory_range);
+ struct anv_bo *bo = main_addr.bo;
+ assert(bo != NULL);
+
+ /* If the additional memory padding was added at the end of the BO for CCS
+ * data, map this region at the granularity of the main/CCS pages.
+ *
+ * Otherwise the image should have additional CCS data at the computed
+ * offset.
+ */
+ if (device->physical->alloc_aux_tt_mem &&
+ (bo->alloc_flags & ANV_BO_ALLOC_AUX_CCS)) {
+ uint64_t main_aux_alignment =
+ intel_aux_map_get_alignment(device->aux_map_ctx);
+ assert(bo->offset % main_aux_alignment == 0);
+ const struct anv_address start_addr = (struct anv_address) {
+ .bo = bo,
+ .offset = ROUND_DOWN_TO(main_addr.offset, main_aux_alignment),
+ };
+ const struct anv_address aux_addr = (struct anv_address) {
+ .bo = bo,
+ .offset = bo->ccs_offset +
+ intel_aux_main_to_aux_offset(device->aux_map_ctx,
+ start_addr.offset),
+ };
+ const struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+ const uint64_t format_bits =
+ intel_aux_map_format_bits_for_isl_surf(surf);
+ /* Make sure to have the mapping cover the entire image from the aux
+ * aligned start.
+ */
+ const uint64_t main_size = align(
+ (main_addr.offset - start_addr.offset) + surf->size_B,
+ main_aux_alignment);
+
+ if (intel_aux_map_add_mapping(device->aux_map_ctx,
+ anv_address_physical(start_addr),
+ anv_address_physical(aux_addr),
+ main_size, format_bits)) {
+ image->planes[plane].aux_tt.mapped = true;
+ image->planes[plane].aux_tt.addr = anv_address_physical(start_addr);
+ image->planes[plane].aux_tt.size = main_size;
+ return true;
+ }
+ } else {
+ if (anv_address_allows_aux_map(device, main_addr)) {
+ const struct anv_address aux_addr =
+ anv_image_address(image,
+ &image->planes[plane].compr_ctrl_memory_range);
+ const struct isl_surf *surf =
+ &image->planes[plane].primary_surface.isl;
+ const uint64_t format_bits =
+ intel_aux_map_format_bits_for_isl_surf(surf);
+ if (intel_aux_map_add_mapping(device->aux_map_ctx,
+ anv_address_physical(main_addr),
+ anv_address_physical(aux_addr),
+ surf->size_B, format_bits)) {
+ image->planes[plane].aux_tt.mapped = true;
+ image->planes[plane].aux_tt.addr = anv_address_physical(main_addr);
+ image->planes[plane].aux_tt.size = surf->size_B;
+ return true;
}
- case VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR: {
- /* Ignore this struct on Android, we cannot access swapchain
- * structures threre.
- */
-#ifndef VK_USE_PLATFORM_ANDROID_KHR
- const VkBindImageMemorySwapchainInfoKHR *swapchain_info =
- (const VkBindImageMemorySwapchainInfoKHR *) s;
- struct anv_image *swapchain_image =
- anv_swapchain_get_image(swapchain_info->swapchain,
- swapchain_info->imageIndex);
- assert(swapchain_image);
- assert(image->vk.aspects == swapchain_image->vk.aspects);
- assert(mem == NULL);
-
- for (int j = 0; j < ARRAY_SIZE(image->bindings); ++j)
- image->bindings[j].address = swapchain_image->bindings[j].address;
-
- /* We must bump the private binding's bo's refcount because, unlike the other
- * bindings, its lifetime is not application-managed.
- */
- struct anv_bo *private_bo =
- image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
- if (private_bo)
- anv_bo_ref(private_bo);
+ }
+ }
- did_bind = true;
-#endif
+ return false;
+}
+
+static VkResult
+anv_bind_image_memory(struct anv_device *device,
+ const VkBindImageMemoryInfo *bind_info)
+{
+ ANV_FROM_HANDLE(anv_device_memory, mem, bind_info->memory);
+ ANV_FROM_HANDLE(anv_image, image, bind_info->image);
+ bool did_bind = false;
+
+ const VkBindMemoryStatusKHR *bind_status =
+ vk_find_struct_const(bind_info->pNext, BIND_MEMORY_STATUS_KHR);
+
+ assert(!anv_image_is_sparse(image));
+
+ /* Resolve will alter the image's aspects, do this first. */
+ if (mem && mem->vk.ahardware_buffer)
+ resolve_ahw_image(device, image, mem);
+
+ vk_foreach_struct_const(s, bind_info->pNext) {
+ switch (s->sType) {
+ case VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO: {
+ const VkBindImagePlaneMemoryInfo *plane_info =
+ (const VkBindImagePlaneMemoryInfo *) s;
+
+ /* Workaround for possible spec bug.
+ *
+ * Unlike VkImagePlaneMemoryRequirementsInfo, which requires that
+ * the image be disjoint (that is, multi-planar format and
+ * VK_IMAGE_CREATE_DISJOINT_BIT), VkBindImagePlaneMemoryInfo allows
+ * the image to be non-disjoint and requires only that the image
+ * have the DISJOINT flag. In this case, regardless of the value of
+ * VkImagePlaneMemoryRequirementsInfo::planeAspect, the behavior is
+ * the same as if VkImagePlaneMemoryRequirementsInfo were omitted.
+ */
+ if (!image->disjoint)
break;
+
+ struct anv_image_binding *binding =
+ anv_image_aspect_to_binding(image, plane_info->planeAspect);
+
+ binding->address = (struct anv_address) {
+ .bo = mem->bo,
+ .offset = bind_info->memoryOffset,
+ };
+
+ ANV_RMV(image_bind, device, image,
+ binding - image->bindings);
+
+ did_bind = true;
+ break;
+ }
+ case VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHR: {
+ /* Ignore this struct on Android, we cannot access swapchain
+ * structures there.
+ */
+#ifndef VK_USE_PLATFORM_ANDROID_KHR
+ const VkBindImageMemorySwapchainInfoKHR *swapchain_info =
+ (const VkBindImageMemorySwapchainInfoKHR *) s;
+ struct anv_image *swapchain_image =
+ anv_swapchain_get_image(swapchain_info->swapchain,
+ swapchain_info->imageIndex);
+ assert(swapchain_image);
+ assert(image->vk.aspects == swapchain_image->vk.aspects);
+ assert(mem == NULL);
+
+ for (int j = 0; j < ARRAY_SIZE(image->bindings); ++j) {
+ assert(memory_ranges_equal(image->bindings[j].memory_range,
+ swapchain_image->bindings[j].memory_range));
+ image->bindings[j].address = swapchain_image->bindings[j].address;
}
+
+ /* We must bump the private binding's bo's refcount because, unlike the other
+ * bindings, its lifetime is not application-managed.
+ */
+ struct anv_bo *private_bo =
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
+ if (private_bo)
+ anv_bo_ref(private_bo);
+
+ did_bind = true;
+#endif
+ break;
+ }
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch"
- case VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID: {
- const VkNativeBufferANDROID *gralloc_info =
- (const VkNativeBufferANDROID *)s;
- VkResult result = anv_image_bind_from_gralloc(device, image,
- gralloc_info);
- if (result != VK_SUCCESS)
- return result;
- did_bind = true;
- break;
- }
+ case VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID: {
+ const VkNativeBufferANDROID *gralloc_info =
+ (const VkNativeBufferANDROID *)s;
+ VkResult result = anv_image_bind_from_gralloc(device, image,
+ gralloc_info);
+ if (result != VK_SUCCESS)
+ return result;
+ did_bind = true;
+ break;
+ }
#pragma GCC diagnostic pop
- default:
- anv_debug_ignored_stype(s->sType);
- break;
- }
+ default:
+ anv_debug_ignored_stype(s->sType);
+ break;
}
+ }
- if (!did_bind) {
- assert(!image->disjoint);
+ if (!did_bind) {
+ assert(!image->disjoint);
- image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address =
- (struct anv_address) {
- .bo = mem->bo,
- .offset = bind_info->memoryOffset,
- };
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].address =
+ (struct anv_address) {
+ .bo = mem->bo,
+ .offset = bind_info->memoryOffset,
+ };
- did_bind = true;
- }
+ ANV_RMV(image_bind, device, image,
+ ANV_IMAGE_MEMORY_BINDING_MAIN);
- /* On platforms that use implicit CCS, if the plane's bo lacks implicit
- * CCS then disable compression on the plane.
+ did_bind = true;
+ }
+
+ /* Now that we have the BO, finalize CCS setup. */
+ for (int p = 0; p < image->n_planes; ++p) {
+ enum anv_image_memory_binding binding =
+ image->planes[p].primary_surface.memory_range.binding;
+ const struct anv_bo *bo =
+ image->bindings[binding].address.bo;
+
+ if (!bo || !isl_aux_usage_has_ccs(image->planes[p].aux_usage))
+ continue;
+
+ /* Do nothing if flat CCS requirements are satisfied.
+ *
+ * Also, assume that imported BOs with a modifier including
+ * CCS live only in local memory. Otherwise the exporter should
+ * have failed the creation of the BO.
*/
- for (int p = 0; p < image->n_planes; ++p) {
- enum anv_image_memory_binding binding =
- image->planes[p].primary_surface.memory_range.binding;
- const struct anv_bo *bo =
- image->bindings[binding].address.bo;
-
- if (bo && !bo->has_implicit_ccs &&
- device->physical->has_implicit_ccs)
- image->planes[p].aux_usage = ISL_AUX_USAGE_NONE;
+ if (device->info->has_flat_ccs &&
+ (anv_bo_is_vram_only(bo) ||
+ (bo->alloc_flags & ANV_BO_ALLOC_IMPORTED)))
+ continue;
+
+ /* If the AUX-TT mapping succeeds, there is nothing else to do. */
+ if (device->info->has_aux_map && anv_image_map_aux_tt(device, image, p))
+ continue;
+
+ /* Do nothing prior to gfx12. There are no special requirements. */
+ if (device->info->ver < 12)
+ continue;
+
+ /* The plane's BO cannot support CCS, disable compression on it. */
+ assert(!isl_drm_modifier_has_aux(image->vk.drm_format_mod));
+
+ anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
+ "BO lacks CCS support. Disabling the CCS aux usage.");
+
+ if (image->planes[p].aux_surface.memory_range.size > 0) {
+ assert(image->planes[p].aux_usage == ISL_AUX_USAGE_HIZ_CCS ||
+ image->planes[p].aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT);
+ image->planes[p].aux_usage = ISL_AUX_USAGE_HIZ;
+ } else {
+ assert(image->planes[p].aux_usage == ISL_AUX_USAGE_CCS_E ||
+ image->planes[p].aux_usage == ISL_AUX_USAGE_FCV_CCS_E ||
+ image->planes[p].aux_usage == ISL_AUX_USAGE_STC_CCS);
+ image->planes[p].aux_usage = ISL_AUX_USAGE_NONE;
}
}
+ if (bind_status)
+ *bind_status->pResult = VK_SUCCESS;
+
return VK_SUCCESS;
}
-void anv_GetImageSubresourceLayout(
- VkDevice device,
- VkImage _image,
- const VkImageSubresource* subresource,
- VkSubresourceLayout* layout)
+VkResult anv_BindImageMemory2(
+ VkDevice _device,
+ uint32_t bindInfoCount,
+ const VkBindImageMemoryInfo* pBindInfos)
{
- ANV_FROM_HANDLE(anv_image, image, _image);
- const struct anv_surface *surface;
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ VkResult result = VK_SUCCESS;
- assert(__builtin_popcount(subresource->aspectMask) == 1);
+ for (uint32_t i = 0; i < bindInfoCount; i++) {
+ VkResult res = anv_bind_image_memory(device, &pBindInfos[i]);
+ if (result == VK_SUCCESS && res != VK_SUCCESS)
+ result = res;
+ }
+
+ return result;
+}
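
/* Caller-side sketch of the VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO
 * path handled in anv_bind_image_memory above: binding the two planes of a
 * disjoint image to separate allocations. Illustrative only; the handles are
 * assumed to be valid and compatible.
 */
static VkResult
bind_disjoint_planes(VkDevice dev, VkImage img,
                     VkDeviceMemory mem0, VkDeviceMemory mem1)
{
   const VkBindImagePlaneMemoryInfo plane0 = {
      .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
      .planeAspect = VK_IMAGE_ASPECT_PLANE_0_BIT,
   };
   const VkBindImagePlaneMemoryInfo plane1 = {
      .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
      .planeAspect = VK_IMAGE_ASPECT_PLANE_1_BIT,
   };
   const VkBindImageMemoryInfo binds[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
         .pNext = &plane0,
         .image = img,
         .memory = mem0,
         .memoryOffset = 0,
      },
      {
         .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
         .pNext = &plane1,
         .image = img,
         .memory = mem1,
         .memoryOffset = 0,
      },
   };
   return vkBindImageMemory2(dev, 2, binds);
}
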
+
+static inline void
+get_image_fast_clear_layout(const struct anv_image *image,
+ VkSubresourceLayout *out_layout)
+{
+ /* If the memory binding differs between primary and fast clear
+ * region, then the returned offset will be incorrect.
+ */
+ assert(image->planes[0].fast_clear_memory_range.binding ==
+ image->planes[0].primary_surface.memory_range.binding);
+ out_layout->offset = image->planes[0].fast_clear_memory_range.offset;
+ out_layout->size = image->planes[0].fast_clear_memory_range.size;
+ /* Refer to the comment above add_aux_state_tracking_buffer() for the
+ * design of the fast clear region. It is not a typical isl surface, so we
+ * just pick placeholder values for these pitches, since there are no other
+ * requirements to meet. We have some freedom to do so according to the spec of
+ * VkSubresourceLayout:
+ *
+ * If the image is non-linear, then rowPitch, arrayPitch, and depthPitch
+ * have an implementation-dependent meaning.
+ *
+ * Fast clear is not supported with linear tiling or linear modifiers,
+ * which don't have the fast clear plane, so these values should be safe.
+ */
+ out_layout->arrayPitch = 1;
+ out_layout->depthPitch = 1;
+ /* On TGL and DG2, 64-byte alignment on clear color is required.
+ * This pitch is ignored on MTL. (drm_fourcc.h)
+ */
+ out_layout->rowPitch = 64;
+}
+
+static void
+anv_get_image_subresource_layout(const struct anv_image *image,
+ const VkImageSubresource2KHR *subresource,
+ VkSubresourceLayout2KHR *layout)
+{
+ const struct anv_image_memory_range *mem_range;
+ const struct isl_surf *isl_surf;
+
+ assert(__builtin_popcount(subresource->imageSubresource.aspectMask) == 1);
/* The Vulkan spec requires that aspectMask be
* VK_IMAGE_ASPECT_MEMORY_PLANE_i_BIT_EXT if tiling is
@@ -1822,11 +2597,13 @@ void anv_GetImageSubresourceLayout(
* so it _should_ correctly use VK_IMAGE_ASPECT_MEMORY_PLANE_* in that case.
* But it incorrectly uses VK_IMAGE_ASPECT_PLANE_*, so we have a temporary
* workaround.
+ *
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10176
*/
if (image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT) {
/* TODO(chadv): Drop this workaround when WSI gets fixed. */
uint32_t mem_plane;
- switch (subresource->aspectMask) {
+ switch (subresource->imageSubresource.aspectMask) {
case VK_IMAGE_ASPECT_MEMORY_PLANE_0_BIT_EXT:
case VK_IMAGE_ASPECT_PLANE_0_BIT:
mem_plane = 0;
@@ -1842,46 +2619,136 @@ void anv_GetImageSubresourceLayout(
default:
unreachable("bad VkImageAspectFlags");
}
+ if (isl_drm_modifier_plane_is_clear_color(image->vk.drm_format_mod,
+ mem_plane)) {
+ get_image_fast_clear_layout(image, &layout->subresourceLayout);
- if (mem_plane == 1 && isl_drm_modifier_has_aux(image->vk.drm_format_mod)) {
+ return;
+ } else if (mem_plane == 1 &&
+ isl_drm_modifier_has_aux(image->vk.drm_format_mod)) {
assert(image->n_planes == 1);
/* If the memory binding differs between primary and aux, then the
* returned offset will be incorrect.
*/
- assert(image->planes[0].aux_surface.memory_range.binding ==
+ mem_range = anv_image_get_aux_memory_range(image, 0);
+ assert(mem_range->binding ==
image->planes[0].primary_surface.memory_range.binding);
- surface = &image->planes[0].aux_surface;
+ isl_surf = &image->planes[0].aux_surface.isl;
} else {
assert(mem_plane < image->n_planes);
- surface = &image->planes[mem_plane].primary_surface;
+ mem_range = &image->planes[mem_plane].primary_surface.memory_range;
+ isl_surf = &image->planes[mem_plane].primary_surface.isl;
}
} else {
const uint32_t plane =
- anv_image_aspect_to_plane(image, subresource->aspectMask);
- surface = &image->planes[plane].primary_surface;
+ anv_image_aspect_to_plane(image, subresource->imageSubresource.aspectMask);
+ mem_range = &image->planes[plane].primary_surface.memory_range;
+ isl_surf = &image->planes[plane].primary_surface.isl;
}
- layout->offset = surface->memory_range.offset;
- layout->rowPitch = surface->isl.row_pitch_B;
- layout->depthPitch = isl_surf_get_array_pitch(&surface->isl);
- layout->arrayPitch = isl_surf_get_array_pitch(&surface->isl);
+ layout->subresourceLayout.offset = mem_range->offset;
+ layout->subresourceLayout.rowPitch = isl_surf->row_pitch_B;
+ layout->subresourceLayout.depthPitch = isl_surf_get_array_pitch(isl_surf);
+ layout->subresourceLayout.arrayPitch = isl_surf_get_array_pitch(isl_surf);
- if (subresource->mipLevel > 0 || subresource->arrayLayer > 0) {
- assert(surface->isl.tiling == ISL_TILING_LINEAR);
+ if (subresource->imageSubresource.mipLevel > 0 ||
+ subresource->imageSubresource.arrayLayer > 0) {
+ assert(isl_surf->tiling == ISL_TILING_LINEAR);
uint64_t offset_B;
- isl_surf_get_image_offset_B_tile_sa(&surface->isl,
- subresource->mipLevel,
- subresource->arrayLayer,
+ isl_surf_get_image_offset_B_tile_sa(isl_surf,
+ subresource->imageSubresource.mipLevel,
+ subresource->imageSubresource.arrayLayer,
0 /* logical_z_offset_px */,
&offset_B, NULL, NULL);
- layout->offset += offset_B;
- layout->size = layout->rowPitch * anv_minify(image->vk.extent.height,
- subresource->mipLevel) *
- image->vk.extent.depth;
+ layout->subresourceLayout.offset += offset_B;
+ layout->subresourceLayout.size =
+ layout->subresourceLayout.rowPitch *
+ u_minify(image->vk.extent.height,
+ subresource->imageSubresource.mipLevel) *
+ image->vk.extent.depth;
} else {
- layout->size = surface->memory_range.size;
+ layout->subresourceLayout.size = mem_range->size;
+ }
+
+ VkImageCompressionPropertiesEXT *comp_props =
+ vk_find_struct(layout->pNext, IMAGE_COMPRESSION_PROPERTIES_EXT);
+ if (comp_props) {
+ comp_props->imageCompressionFixedRateFlags =
+ VK_IMAGE_COMPRESSION_FIXED_RATE_NONE_EXT;
+ comp_props->imageCompressionFlags = VK_IMAGE_COMPRESSION_DISABLED_EXT;
+ for (uint32_t p = 0; p < image->n_planes; p++) {
+ if (image->planes[p].aux_usage != ISL_AUX_USAGE_NONE) {
+ comp_props->imageCompressionFlags = VK_IMAGE_COMPRESSION_DEFAULT_EXT;
+ break;
+ }
+ }
+ }
+}
+
+void anv_GetDeviceImageSubresourceLayoutKHR(
+ VkDevice _device,
+ const VkDeviceImageSubresourceInfoKHR* pInfo,
+ VkSubresourceLayout2KHR* pLayout)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ struct anv_image image = { 0 };
+
+ if (anv_image_init_from_create_info(device, &image, pInfo->pCreateInfo,
+ true) != VK_SUCCESS) {
+ pLayout->subresourceLayout = (VkSubresourceLayout) { 0, };
+ return;
+ }
+
+ anv_get_image_subresource_layout(&image, pInfo->pSubresource, pLayout);
+}
+
+void anv_GetImageSubresourceLayout2KHR(
+ VkDevice device,
+ VkImage _image,
+ const VkImageSubresource2KHR* pSubresource,
+ VkSubresourceLayout2KHR* pLayout)
+{
+ ANV_FROM_HANDLE(anv_image, image, _image);
+
+ anv_get_image_subresource_layout(image, pSubresource, pLayout);
+}
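
/* Caller-side sketch of the query above for a DRM-format-modifier image:
 * per the workaround note in anv_get_image_subresource_layout, the spec
 * expects VK_IMAGE_ASPECT_MEMORY_PLANE_*_BIT_EXT aspects for such images.
 * Illustrative only; assumes the VK_KHR_maintenance5 entry point.
 */
static VkSubresourceLayout
query_memory_plane_layout(VkDevice dev, VkImage img,
                          VkImageAspectFlagBits mem_plane_aspect)
{
   const VkImageSubresource2KHR subres = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_SUBRESOURCE_2_KHR,
      .imageSubresource = {
         .aspectMask = mem_plane_aspect, /* e.g. MEMORY_PLANE_0_BIT_EXT */
         .mipLevel = 0,
         .arrayLayer = 0,
      },
   };
   VkSubresourceLayout2KHR layout = {
      .sType = VK_STRUCTURE_TYPE_SUBRESOURCE_LAYOUT_2_KHR,
   };
   vkGetImageSubresourceLayout2KHR(dev, img, &subres, &layout);
   return layout.subresourceLayout;
}
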
+
+static VkImageUsageFlags
+anv_image_flags_filter_for_queue(VkImageUsageFlags usages,
+ VkQueueFlagBits queue_flags)
+{
+ /* Eliminate graphics usages if the queue is not graphics capable */
+ if (!(queue_flags & VK_QUEUE_GRAPHICS_BIT)) {
+ usages &= ~(VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_FRAGMENT_DENSITY_MAP_BIT_EXT |
+ VK_IMAGE_USAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
+ VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT);
}
+
+ /* Eliminate sampling & storage usages if the queue is neither graphics nor
+ * compute capable
+ */
+ if (!(queue_flags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT))) {
+ usages &= ~(VK_IMAGE_USAGE_SAMPLED_BIT |
+ VK_IMAGE_USAGE_STORAGE_BIT);
+ }
+
+ /* Eliminate transfer usages if the queue is not transfer, compute, or
+ * graphics capable.
+ */
+ if (!(queue_flags & (VK_QUEUE_TRANSFER_BIT |
+ VK_QUEUE_COMPUTE_BIT |
+ VK_QUEUE_GRAPHICS_BIT))) {
+ usages &= ~(VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT);
+ }
+
+ return usages;
}
/**
@@ -1900,7 +2767,8 @@ enum isl_aux_state ATTRIBUTE_PURE
anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
const struct anv_image * const image,
const VkImageAspectFlagBits aspect,
- const VkImageLayout layout)
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags)
{
/* Validate the inputs. */
@@ -1947,8 +2815,6 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
isl_drm_modifier_get_default_aux_state(image->vk.drm_format_mod);
switch (aux_state) {
- default:
- assert(!"unexpected isl_aux_state");
case ISL_AUX_STATE_AUX_INVALID:
/* The modifier does not support compression. But, if we arrived
* here, then we have enabled compression on it anyway, in which case
@@ -1964,8 +2830,12 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
* pass-through.
*/
return ISL_AUX_STATE_PASS_THROUGH;
+ case ISL_AUX_STATE_COMPRESSED_CLEAR:
+ return ISL_AUX_STATE_COMPRESSED_CLEAR;
case ISL_AUX_STATE_COMPRESSED_NO_CLEAR:
return ISL_AUX_STATE_COMPRESSED_NO_CLEAR;
+ default:
+ unreachable("unexpected isl_aux_state");
}
}
@@ -1976,14 +2846,17 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
const bool read_only = vk_image_layout_is_read_only(layout, aspect);
const VkImageUsageFlags image_aspect_usage =
- vk_image_usage(&image->vk, aspect);
+ anv_image_flags_filter_for_queue(
+ vk_image_usage(&image->vk, aspect), queue_flags);
const VkImageUsageFlags usage =
vk_image_layout_to_usage_flags(layout, aspect) & image_aspect_usage;
bool aux_supported = true;
bool clear_supported = isl_aux_usage_has_fast_clears(aux_usage);
- if ((usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) && !read_only) {
+ if ((usage & (VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT |
+ VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)) &&
+ !read_only) {
/* This image could be used as both an input attachment and a render
* target (depth, stencil, or color) at the same time and this can cause
* corruption.
@@ -1993,17 +2866,12 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
*
* TODO: Should we be disabling this in more cases?
*/
- if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) {
+ if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT && devinfo->ver <= 9) {
aux_supported = false;
clear_supported = false;
}
}
- if (usage & VK_IMAGE_USAGE_STORAGE_BIT) {
- aux_supported = false;
- clear_supported = false;
- }
-
if (usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
VK_IMAGE_USAGE_SAMPLED_BIT |
VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
@@ -2034,6 +2902,7 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
break;
case ISL_AUX_USAGE_CCS_E:
+ case ISL_AUX_USAGE_FCV_CCS_E:
case ISL_AUX_USAGE_STC_CCS:
break;
@@ -2057,7 +2926,8 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
case ISL_AUX_USAGE_CCS_D:
/* We only support clear in exactly one state */
- if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
+ if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL ||
+ layout == VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL) {
assert(aux_supported);
assert(clear_supported);
return ISL_AUX_STATE_PARTIAL_CLEAR;
@@ -2066,6 +2936,7 @@ anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
}
case ISL_AUX_USAGE_CCS_E:
+ case ISL_AUX_USAGE_FCV_CCS_E:
if (aux_supported) {
assert(clear_supported);
return ISL_AUX_STATE_COMPRESSED_CLEAR;
@@ -2110,7 +2981,8 @@ anv_layout_to_aux_usage(const struct intel_device_info * const devinfo,
const struct anv_image * const image,
const VkImageAspectFlagBits aspect,
const VkImageUsageFlagBits usage,
- const VkImageLayout layout)
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags)
{
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
@@ -2121,7 +2993,7 @@ anv_layout_to_aux_usage(const struct intel_device_info * const devinfo,
return ISL_AUX_USAGE_NONE;
enum isl_aux_state aux_state =
- anv_layout_to_aux_state(devinfo, image, aspect, layout);
+ anv_layout_to_aux_state(devinfo, image, aspect, layout, queue_flags);
switch (aux_state) {
case ISL_AUX_STATE_CLEAR:
@@ -2176,9 +3048,10 @@ enum anv_fast_clear_type ATTRIBUTE_PURE
anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
const struct anv_image * const image,
const VkImageAspectFlagBits aspect,
- const VkImageLayout layout)
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags)
{
- if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR)
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
return ANV_FAST_CLEAR_NONE;
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
@@ -2187,14 +3060,11 @@ anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
return ANV_FAST_CLEAR_NONE;
- /* We don't support MSAA fast-clears on Ivybridge or Bay Trail because they
- * lack the MI ALU which we need to determine the predicates.
- */
- if (devinfo->verx10 == 70 && image->vk.samples > 1)
- return ANV_FAST_CLEAR_NONE;
-
enum isl_aux_state aux_state =
- anv_layout_to_aux_state(devinfo, image, aspect, layout);
+ anv_layout_to_aux_state(devinfo, image, aspect, layout, queue_flags);
+
+ const VkImageUsageFlags layout_usage =
+ vk_image_layout_to_usage_flags(layout, aspect);
switch (aux_state) {
case ISL_AUX_STATE_CLEAR:
@@ -2204,15 +3074,31 @@ anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
case ISL_AUX_STATE_COMPRESSED_CLEAR:
if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT) {
return ANV_FAST_CLEAR_DEFAULT_VALUE;
- } else if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) {
+ } else if (layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL ||
+ layout == VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL) {
+ /* The image might not support non zero fast clears when mutable. */
+ if (!image->planes[plane].can_non_zero_fast_clear)
+ return ANV_FAST_CLEAR_DEFAULT_VALUE;
+
/* When we're in a render pass we have the clear color data from the
* VkRenderPassBeginInfo and we can use arbitrary clear colors. They
* must get partially resolved before we leave the render pass.
*/
return ANV_FAST_CLEAR_ANY;
+ } else if (layout_usage & (VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+ VK_IMAGE_USAGE_TRANSFER_DST_BIT)) {
+ /* Fast clear with non zero color is not supported during transfer
+ * operations since transfer may do format reinterpretation.
+ */
+ return ANV_FAST_CLEAR_DEFAULT_VALUE;
} else if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS ||
- image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
+ image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E ||
+ image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
if (devinfo->ver >= 11) {
+ /* The image might not support non zero fast clears when mutable. */
+ if (!image->planes[plane].can_non_zero_fast_clear)
+ return ANV_FAST_CLEAR_DEFAULT_VALUE;
+
/* On ICL and later, the sampler hardware uses a copy of the clear
* value that is encoded as a pixel value. Therefore, we can use
* any clear color we like for sampling.
@@ -2241,10 +3127,60 @@ anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
}
+/**
+ * This function determines if the layout & usage of an image can have
+ * untracked aux writes. When we see a transition that matches these criteria,
+ * we need to mark the image as compressed-written so that our predicated
+ * resolves work properly.
+ *
+ * @param devinfo The device information of the Intel GPU.
+ * @param image The image that may contain a collection of buffers.
+ * @param aspect The aspect of the image to be accessed.
+ * @param layout The current layout of the image aspect(s).
+ */
+bool
+anv_layout_has_untracked_aux_writes(const struct intel_device_info * const devinfo,
+ const struct anv_image * const image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags)
+{
+ const VkImageUsageFlags image_aspect_usage =
+ vk_image_usage(&image->vk, aspect);
+ const VkImageUsageFlags usage =
+ vk_image_layout_to_usage_flags(layout, aspect) & image_aspect_usage;
+
+ /* Storage is the only usage where we write the image not through a
+ * render target but through a descriptor. Since the introduction of
+ * VK_EXT_descriptor_indexing and the update-after-bind feature, it has
+ * become impossible to track writes to images in descriptors at command
+ * buffer build time. So it's
+ * not possible to mark an image as compressed like we do in
+ * genX_cmd_buffer.c(EndRendering) or anv_blorp.c for all transfer
+ * operations.
+ */
+ if (!(usage & VK_IMAGE_USAGE_STORAGE_BIT))
+ return false;
+
+ /* No AUX, no writes to the AUX surface :) */
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ const enum isl_aux_usage aux_usage = image->planes[plane].aux_usage;
+ if (aux_usage == ISL_AUX_USAGE_NONE)
+ return false;
+
+ return true;
+}
+
static struct anv_state
-alloc_surface_state(struct anv_device *device)
+maybe_alloc_surface_state(struct anv_device *device,
+ struct anv_state_stream *surface_state_stream)
{
- return anv_state_pool_alloc(&device->surface_state_pool, 64, 64);
+ if (device->physical->indirect_descriptors) {
+ if (surface_state_stream)
+ return anv_state_stream_alloc(surface_state_stream, 64, 64);
+ return anv_state_pool_alloc(&device->bindless_surface_state_pool, 64, 64);
+ } else {
+ return ANV_STATE_NULL;
+ }
}
static enum isl_channel_select
@@ -2272,10 +3208,28 @@ anv_image_fill_surface_state(struct anv_device *device,
enum isl_aux_usage aux_usage,
const union isl_color_value *clear_color,
enum anv_image_view_state_flags flags,
- struct anv_surface_state *state_inout,
- struct brw_image_param *image_param_out)
+ struct anv_surface_state *state_inout)
{
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ if (image->emu_plane_format != VK_FORMAT_UNDEFINED) {
+ const uint16_t view_bpb = isl_format_get_layout(view_in->format)->bpb;
+ const uint16_t plane_bpb = isl_format_get_layout(
+ image->planes[plane].primary_surface.isl.format)->bpb;
+
+ /* We should redirect to the hidden plane when the original view format
+ * is compressed or when the view usage is storage. But we don't always
+ * have visibility to the original view format so we also check for size
+ * compatibility.
+ */
+ if (isl_format_is_compressed(view_in->format) ||
+ (view_usage & ISL_SURF_USAGE_STORAGE_BIT) ||
+ view_bpb != plane_bpb) {
+ plane = image->n_planes;
+ assert(isl_format_get_layout(
+ image->planes[plane].primary_surface.isl.format)->bpb ==
+ view_bpb);
+ }
+ }
const struct anv_surface *surface = &image->planes[plane].primary_surface,
*aux_surface = &image->planes[plane].aux_surface;
@@ -2283,42 +3237,14 @@ anv_image_fill_surface_state(struct anv_device *device,
struct isl_view view = *view_in;
view.usage |= view_usage;
- /* For texturing with VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL from a
- * compressed surface with a shadow surface, we use the shadow instead of
- * the primary surface. The shadow surface will be tiled, unlike the main
- * surface, so it should get significantly better performance.
- */
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- isl_format_is_compressed(view.format) &&
- (flags & ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL)) {
- assert(isl_format_is_compressed(surface->isl.format));
- assert(surface->isl.tiling == ISL_TILING_LINEAR);
- assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
- surface = &image->planes[plane].shadow_surface;
- }
-
- /* For texturing from stencil on gfx7, we have to sample from a shadow
- * surface because we don't support W-tiling in the sampler.
- */
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
- assert(device->info.ver == 7);
- assert(view_usage & ISL_SURF_USAGE_TEXTURE_BIT);
- surface = &image->planes[plane].shadow_surface;
- }
-
if (view_usage == ISL_SURF_USAGE_RENDER_TARGET_BIT)
view.swizzle = anv_swizzle_for_render(view.swizzle);
- /* On Ivy Bridge and Bay Trail we do the swizzle in the shader */
- if (device->info.verx10 == 70)
- view.swizzle = ISL_SWIZZLE_IDENTITY;
-
/* If this is a HiZ buffer we can sample from with a programmable clear
* value (SKL+), define the clear value to the optimal constant.
*/
union isl_color_value default_clear_color = { .u32 = { 0, } };
- if (device->info.ver >= 9 && aspect == VK_IMAGE_ASPECT_DEPTH_BIT)
+ if (aspect == VK_IMAGE_ASPECT_DEPTH_BIT)
default_clear_color.f32[0] = ANV_HZ_FC_VAL;
if (!clear_color)
clear_color = &default_clear_color;
@@ -2326,117 +3252,85 @@ anv_image_fill_surface_state(struct anv_device *device,
const struct anv_address address =
anv_image_address(image, &surface->memory_range);
- if (view_usage == ISL_SURF_USAGE_STORAGE_BIT &&
- !(flags & ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY) &&
- !isl_has_matching_typed_storage_image_format(&device->info,
- view.format)) {
- /* In this case, we are a writeable storage buffer which needs to be
- * lowered to linear. All tiling and offset calculations will be done in
- * the shader.
- */
- assert(aux_usage == ISL_AUX_USAGE_NONE);
- isl_buffer_fill_state(&device->isl_dev, state_inout->state.map,
- .address = anv_address_physical(address),
- .size_B = surface->isl.size_B,
- .format = ISL_FORMAT_RAW,
- .swizzle = ISL_SWIZZLE_IDENTITY,
- .stride_B = 1,
- .mocs = anv_mocs(device, address.bo, view_usage));
- state_inout->address = address,
- state_inout->aux_address = ANV_NULL_ADDRESS;
- state_inout->clear_address = ANV_NULL_ADDRESS;
- } else {
- if (view_usage == ISL_SURF_USAGE_STORAGE_BIT &&
- !(flags & ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY)) {
- /* Typed surface reads support a very limited subset of the shader
- * image formats. Translate it into the closest format the hardware
- * supports.
- */
- assert(aux_usage == ISL_AUX_USAGE_NONE);
- view.format = isl_lower_storage_image_format(&device->info,
- view.format);
- }
+ void *surface_state_map = state_inout->state_data.data;
- const struct isl_surf *isl_surf = &surface->isl;
+ const struct isl_surf *isl_surf = &surface->isl;
- struct isl_surf tmp_surf;
- uint64_t offset_B = 0;
- uint32_t tile_x_sa = 0, tile_y_sa = 0;
- if (isl_format_is_compressed(surface->isl.format) &&
- !isl_format_is_compressed(view.format)) {
- /* We're creating an uncompressed view of a compressed surface. This
- * is allowed but only for a single level/layer.
- */
- assert(surface->isl.samples == 1);
- assert(view.levels == 1);
- assert(view.array_len == 1);
-
- ASSERTED bool ok =
- isl_surf_get_uncompressed_surf(&device->isl_dev, isl_surf, &view,
- &tmp_surf, &view,
- &offset_B, &tile_x_sa, &tile_y_sa);
- assert(ok);
- isl_surf = &tmp_surf;
-
- if (device->info.ver <= 8) {
- assert(surface->isl.tiling == ISL_TILING_LINEAR);
- assert(tile_x_sa == 0);
- assert(tile_y_sa == 0);
- }
- }
-
- state_inout->address = anv_address_add(address, offset_B);
+ struct isl_surf tmp_surf;
+ uint64_t offset_B = 0;
+ uint32_t tile_x_sa = 0, tile_y_sa = 0;
+ if (isl_format_is_compressed(surface->isl.format) &&
+ !isl_format_is_compressed(view.format)) {
+ /* We're creating an uncompressed view of a compressed surface. This is
+ * allowed but only for a single level/layer.
+ */
+ assert(surface->isl.samples == 1);
+ assert(view.levels == 1);
+
+ ASSERTED bool ok =
+ isl_surf_get_uncompressed_surf(&device->isl_dev, isl_surf, &view,
+ &tmp_surf, &view,
+ &offset_B, &tile_x_sa, &tile_y_sa);
+ assert(ok);
+ isl_surf = &tmp_surf;
+ }
- struct anv_address aux_address = ANV_NULL_ADDRESS;
- if (aux_usage != ISL_AUX_USAGE_NONE)
- aux_address = anv_image_address(image, &aux_surface->memory_range);
- state_inout->aux_address = aux_address;
+ state_inout->address = anv_address_add(address, offset_B);
- struct anv_address clear_address = ANV_NULL_ADDRESS;
- if (device->info.ver >= 10 && isl_aux_usage_has_fast_clears(aux_usage)) {
- clear_address = anv_image_get_clear_color_addr(device, image, aspect);
- }
- state_inout->clear_address = clear_address;
-
- isl_surf_fill_state(&device->isl_dev, state_inout->state.map,
- .surf = isl_surf,
- .view = &view,
- .address = anv_address_physical(state_inout->address),
- .clear_color = *clear_color,
- .aux_surf = &aux_surface->isl,
- .aux_usage = aux_usage,
- .aux_address = anv_address_physical(aux_address),
- .clear_address = anv_address_physical(clear_address),
- .use_clear_address = !anv_address_is_null(clear_address),
- .mocs = anv_mocs(device, state_inout->address.bo,
- view_usage),
- .x_offset_sa = tile_x_sa,
- .y_offset_sa = tile_y_sa);
-
- /* With the exception of gfx8, the bottom 12 bits of the MCS base address
- * are used to store other information. This should be ok, however,
- * because the surface buffer addresses are always 4K page aligned.
- */
- if (!anv_address_is_null(aux_address)) {
- uint32_t *aux_addr_dw = state_inout->state.map +
- device->isl_dev.ss.aux_addr_offset;
- assert((aux_address.offset & 0xfff) == 0);
- state_inout->aux_address.offset |= *aux_addr_dw & 0xfff;
- }
+ struct anv_address aux_address = ANV_NULL_ADDRESS;
+ if (aux_usage != ISL_AUX_USAGE_NONE)
+ aux_address = anv_image_address(image, &aux_surface->memory_range);
+ state_inout->aux_address = aux_address;
- if (device->info.ver >= 10 && clear_address.bo) {
- uint32_t *clear_addr_dw = state_inout->state.map +
- device->isl_dev.ss.clear_color_state_offset;
- assert((clear_address.offset & 0x3f) == 0);
- state_inout->clear_address.offset |= *clear_addr_dw & 0x3f;
- }
+ struct anv_address clear_address = ANV_NULL_ADDRESS;
+ if (device->info->ver >= 10 && isl_aux_usage_has_fast_clears(aux_usage)) {
+ clear_address = anv_image_get_clear_color_addr(device, image, aspect);
+ }
+ state_inout->clear_address = clear_address;
+
+ isl_surf_fill_state(&device->isl_dev, surface_state_map,
+ .surf = isl_surf,
+ .view = &view,
+ .address = anv_address_physical(state_inout->address),
+ .clear_color = *clear_color,
+ .aux_surf = &aux_surface->isl,
+ .aux_usage = aux_usage,
+ .aux_address = anv_address_physical(aux_address),
+ .clear_address = anv_address_physical(clear_address),
+ .use_clear_address = !anv_address_is_null(clear_address),
+ .mocs = anv_mocs(device, state_inout->address.bo,
+ view_usage),
+ .x_offset_sa = tile_x_sa,
+ .y_offset_sa = tile_y_sa,
+ /* Assume robustness with EXT_pipeline_robustness
+ * because it can be toggled per pipeline and we have
+ * no visibility into that here.
+ */
+ .robust_image_access =
+ device->vk.enabled_features.robustImageAccess ||
+ device->vk.enabled_features.robustImageAccess2 ||
+ device->vk.enabled_extensions.EXT_pipeline_robustness);
+
+ /* With the exception of gfx8, the bottom 12 bits of the MCS base address
+ * are used to store other information. This should be ok, however, because
+ * the surface buffer addresses are always 4K page aligned.
+ */
+ if (!anv_address_is_null(aux_address)) {
+ uint32_t *aux_addr_dw = surface_state_map +
+ device->isl_dev.ss.aux_addr_offset;
+ assert((aux_address.offset & 0xfff) == 0);
+ state_inout->aux_address.offset |= *aux_addr_dw & 0xfff;
}
- if (image_param_out) {
- assert(view_usage == ISL_SURF_USAGE_STORAGE_BIT);
- isl_surf_fill_image_param(&device->isl_dev, image_param_out,
- &surface->isl, &view);
+ if (device->info->ver >= 10 && clear_address.bo) {
+ uint32_t *clear_addr_dw = surface_state_map +
+ device->isl_dev.ss.clear_color_state_offset;
+ assert((clear_address.offset & 0x3f) == 0);
+ state_inout->clear_address.offset |= *clear_addr_dw & 0x3f;
}
+
+ if (state_inout->state.map)
+ memcpy(state_inout->state.map, surface_state_map, ANV_SURFACE_STATE_SIZE);
}
static uint32_t
@@ -2446,67 +3340,193 @@ anv_image_aspect_get_planes(VkImageAspectFlags aspect_mask)
return util_bitcount(aspect_mask);
}
-VkResult
-anv_CreateImageView(VkDevice _device,
- const VkImageViewCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *pAllocator,
- VkImageView *pView)
+bool
+anv_can_hiz_clear_ds_view(struct anv_device *device,
+ const struct anv_image_view *iview,
+ VkImageLayout layout,
+ VkImageAspectFlags clear_aspects,
+ float depth_clear_value,
+ VkRect2D render_area,
+ const VkQueueFlagBits queue_flags)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
- struct anv_image_view *iview;
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
+ return false;
- iview = vk_image_view_create(&device->vk, pCreateInfo,
- pAllocator, sizeof(*iview));
- if (iview == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ /* If we're just clearing stencil, we can always HiZ clear */
+ if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
+ return true;
- iview->image = image;
- iview->n_planes = anv_image_aspect_get_planes(iview->vk.aspects);
+ /* We must have depth in order to have HiZ */
+ if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
+ return false;
- /* Check if a conversion info was passed. */
- const struct anv_format *conv_format = NULL;
- const VkSamplerYcbcrConversionInfo *conv_info =
- vk_find_struct_const(pCreateInfo->pNext, SAMPLER_YCBCR_CONVERSION_INFO);
+ const enum isl_aux_usage clear_aux_usage =
+ anv_layout_to_aux_usage(device->info, iview->image,
+ VK_IMAGE_ASPECT_DEPTH_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ layout, queue_flags);
+ if (!blorp_can_hiz_clear_depth(device->info,
+ &iview->image->planes[0].primary_surface.isl,
+ clear_aux_usage,
+ iview->planes[0].isl.base_level,
+ iview->planes[0].isl.base_array_layer,
+ render_area.offset.x,
+ render_area.offset.y,
+ render_area.offset.x +
+ render_area.extent.width,
+ render_area.offset.y +
+ render_area.extent.height))
+ return false;
-#ifdef ANDROID
- /* If image has an external format, the pNext chain must contain an
- * instance of VKSamplerYcbcrConversionInfo with a conversion object
- * created with the same external format as image."
- */
- assert(!image->vk.android_external_format || conv_info);
-#endif
+ if (depth_clear_value != ANV_HZ_FC_VAL)
+ return false;
+
+ /* If we got here, then we can fast clear */
+ return true;
+}
+
+static bool
+isl_color_value_requires_conversion(union isl_color_value color,
+ const struct isl_surf *surf,
+ const struct isl_view *view)
+{
+ if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
+ return false;
+
+ uint32_t surf_pack[4] = { 0, 0, 0, 0 };
+ isl_color_value_pack(&color, surf->format, surf_pack);
+
+ uint32_t view_pack[4] = { 0, 0, 0, 0 };
+ union isl_color_value swiz_color =
+ isl_color_value_swizzle_inv(color, view->swizzle);
+ isl_color_value_pack(&swiz_color, view->format, view_pack);
- if (conv_info) {
- ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion, conv_info->conversion);
- conv_format = conversion->format;
+ return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
+}
+
+bool
+anv_can_fast_clear_color_view(struct anv_device *device,
+ struct anv_image_view *iview,
+ VkImageLayout layout,
+ union isl_color_value clear_color,
+ uint32_t num_layers,
+ VkRect2D render_area,
+ const VkQueueFlagBits queue_flags)
+{
+ if (INTEL_DEBUG(DEBUG_NO_FAST_CLEAR))
+ return false;
+
+ if (iview->planes[0].isl.base_array_layer >=
+ anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
+ iview->planes[0].isl.base_level))
+ return false;
+
+ /* Start by getting the fast clear type. We use the first subpass
+ * layout here because we don't want to fast-clear if the first subpass
+ * to use the attachment can't handle fast-clears.
+ */
+ enum anv_fast_clear_type fast_clear_type =
+ anv_layout_to_fast_clear_type(device->info, iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ layout, queue_flags);
+ switch (fast_clear_type) {
+ case ANV_FAST_CLEAR_NONE:
+ return false;
+ case ANV_FAST_CLEAR_DEFAULT_VALUE:
+ if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
+ return false;
+ break;
+ case ANV_FAST_CLEAR_ANY:
+ break;
}
-#ifdef ANDROID
- /* "If image has an external format, format must be VK_FORMAT_UNDEFINED." */
- assert(!image->vk.android_external_format ||
- pCreateInfo->format == VK_FORMAT_UNDEFINED);
-#endif
+ /* Potentially, we could do partial fast-clears but doing so has crazy
+ * alignment restrictions. It's easier to just restrict to full size
+ * fast clears for now.
+ */
+ if (render_area.offset.x != 0 ||
+ render_area.offset.y != 0 ||
+ render_area.extent.width != iview->vk.extent.width ||
+ render_area.extent.height != iview->vk.extent.height)
+ return false;
- /* Format is undefined, this can happen when using external formats. Set
- * view format from the passed conversion info.
+ /* If the clear color is one that would require non-trivial format
+ * conversion on resolve, we don't bother with the fast clear. This
+ * shouldn't be common as most clear colors are 0/1 and the most common
+ * format re-interpretation is for sRGB.
*/
- if (iview->vk.format == VK_FORMAT_UNDEFINED && conv_format)
- iview->vk.format = conv_format->vk_format;
+ if (isl_color_value_requires_conversion(clear_color,
+ &iview->image->planes[0].primary_surface.isl,
+ &iview->planes[0].isl)) {
+ anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
+ "Cannot fast-clear to colors which would require "
+ "format conversion on resolve");
+ return false;
+ }
+
+ /* We only allow fast clears to the first slice of an image (level 0,
+    * layer 0) and only for the entire slice. This guarantees us that, at
+    * any given time, there is only one clear color on any given image.
+    * At the time of our testing (Jan 17, 2018), there
+ * were no known applications which would benefit from fast-clearing
+ * more than just the first slice.
+ */
+ if (iview->planes[0].isl.base_level > 0 ||
+ iview->planes[0].isl.base_array_layer > 0) {
+ anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
+ "Rendering with multi-lod or multi-layer framebuffer "
+ "with LOAD_OP_LOAD and baseMipLevel > 0 or "
+ "baseArrayLayer > 0. Not fast clearing.");
+ return false;
+ }
+
+ if (num_layers > 1) {
+ anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
+ "Rendering to a multi-layer framebuffer with "
+ "LOAD_OP_CLEAR. Only fast-clearing the first slice");
+ }
+
+ /* Wa_18020603990 - slow clear surfaces up to 256x256, 32bpp. */
+ if (intel_needs_workaround(device->info, 18020603990)) {
+ const struct anv_surface *anv_surf =
+ &iview->image->planes->primary_surface;
+ if (isl_format_get_layout(anv_surf->isl.format)->bpb <= 32 &&
+ anv_surf->isl.logical_level0_px.w <= 256 &&
+ anv_surf->isl.logical_level0_px.h <= 256)
+ return false;
+ }
+
+ return true;
+}
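
A minimal sketch of how a clear path might gate on the helper above; the surrounding call site (function name, fallback comments) is an assumption for illustration, not the actual anv code:

static void
clear_color_attachment(struct anv_cmd_buffer *cmd_buffer,
                       struct anv_image_view *iview,
                       VkImageLayout layout,
                       union isl_color_value clear_color,
                       uint32_t num_layers,
                       VkRect2D render_area,
                       VkQueueFlagBits queue_flags)
{
   struct anv_device *device = cmd_buffer->device;

   if (anv_can_fast_clear_color_view(device, iview, layout, clear_color,
                                     num_layers, render_area, queue_flags)) {
      /* Record the new clear color and emit a fast clear of slice (0, 0);
       * layers beyond the first still need a regular clear.
       */
   } else {
      /* Emit a regular (slow) clear covering all requested layers. */
   }
}
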
+
+void
+anv_image_view_init(struct anv_device *device,
+ struct anv_image_view *iview,
+ const VkImageViewCreateInfo *pCreateInfo,
+ struct anv_state_stream *surface_state_stream)
+{
+ ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
+
+ vk_image_view_init(&device->vk, &iview->vk, false, pCreateInfo);
+ iview->image = image;
+ iview->n_planes = anv_image_aspect_get_planes(iview->vk.aspects);
+ iview->use_surface_state_stream = surface_state_stream != NULL;
/* Now go through the underlying image selected planes and map them to
* planes in the image view.
*/
anv_foreach_image_aspect_bit(iaspect_bit, image, iview->vk.aspects) {
- const uint32_t iplane =
- anv_aspect_to_plane(image->vk.aspects, 1UL << iaspect_bit);
const uint32_t vplane =
anv_aspect_to_plane(iview->vk.aspects, 1UL << iaspect_bit);
- struct anv_format_plane format;
- format = anv_get_format_plane(&device->info, iview->vk.format,
- vplane, image->vk.tiling);
- iview->planes[vplane].image_plane = iplane;
+ VkFormat view_format = iview->vk.view_format;
+ if (anv_is_format_emulated(device->physical, view_format)) {
+ assert(image->emu_plane_format != VK_FORMAT_UNDEFINED);
+ view_format =
+ anv_get_emulation_format(device->physical, view_format);
+ }
+ const struct anv_format_plane format = anv_get_format_plane(
+ device->info, view_format, vplane, image->vk.tiling);
iview->planes[vplane].isl = (struct isl_view) {
.format = format.isl_format,
@@ -2514,6 +3534,7 @@ anv_CreateImageView(VkDevice _device,
.levels = iview->vk.level_count,
.base_array_layer = iview->vk.base_array_layer,
.array_len = iview->vk.layer_count,
+ .min_lod_clamp = iview->vk.min_lod,
.swizzle = {
.r = remap_swizzle(iview->vk.swizzle.r, format.swizzle),
.g = remap_swizzle(iview->vk.swizzle.g, format.swizzle),
@@ -2534,73 +3555,114 @@ anv_CreateImageView(VkDevice _device,
iview->planes[vplane].isl.usage = 0;
}
- if (iview->vk.usage & VK_IMAGE_USAGE_SAMPLED_BIT ||
- (iview->vk.usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT &&
- !(iview->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV))) {
- iview->planes[vplane].optimal_sampler_surface_state.state = alloc_surface_state(device);
- iview->planes[vplane].general_sampler_surface_state.state = alloc_surface_state(device);
+ if (iview->vk.usage & (VK_IMAGE_USAGE_SAMPLED_BIT |
+ VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)) {
+ iview->planes[vplane].optimal_sampler.state =
+ maybe_alloc_surface_state(device, surface_state_stream);
+ iview->planes[vplane].general_sampler.state =
+ maybe_alloc_surface_state(device, surface_state_stream);
enum isl_aux_usage general_aux_usage =
- anv_layout_to_aux_usage(&device->info, image, 1UL << iaspect_bit,
+ anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit,
VK_IMAGE_USAGE_SAMPLED_BIT,
- VK_IMAGE_LAYOUT_GENERAL);
+ VK_IMAGE_LAYOUT_GENERAL,
+ VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT |
+ VK_QUEUE_TRANSFER_BIT);
enum isl_aux_usage optimal_aux_usage =
- anv_layout_to_aux_usage(&device->info, image, 1UL << iaspect_bit,
+ anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit,
VK_IMAGE_USAGE_SAMPLED_BIT,
- VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+ VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+ VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT |
+ VK_QUEUE_TRANSFER_BIT);
anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit,
&iview->planes[vplane].isl,
ISL_SURF_USAGE_TEXTURE_BIT,
optimal_aux_usage, NULL,
ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL,
- &iview->planes[vplane].optimal_sampler_surface_state,
- NULL);
+ &iview->planes[vplane].optimal_sampler);
anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit,
&iview->planes[vplane].isl,
ISL_SURF_USAGE_TEXTURE_BIT,
general_aux_usage, NULL,
0,
- &iview->planes[vplane].general_sampler_surface_state,
- NULL);
+ &iview->planes[vplane].general_sampler);
}
/* NOTE: This one needs to go last since it may stomp isl_view.format */
if (iview->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT) {
- if (isl_is_storage_image_format(format.isl_format)) {
- iview->planes[vplane].storage_surface_state.state =
- alloc_surface_state(device);
-
- anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit,
- &iview->planes[vplane].isl,
- ISL_SURF_USAGE_STORAGE_BIT,
- ISL_AUX_USAGE_NONE, NULL,
- 0,
- &iview->planes[vplane].storage_surface_state,
- &iview->planes[vplane].storage_image_param);
- } else {
- /* In this case, we support the format but, because there's no
- * SPIR-V format specifier corresponding to it, we only support
- * NonReadable (writeonly in GLSL) access. Instead of hanging in
- * these invalid cases, we give them a NULL descriptor.
- */
- assert(isl_format_supports_typed_writes(&device->info,
- format.isl_format));
- iview->planes[vplane].storage_surface_state.state =
- device->null_surface_state;
+ struct isl_view storage_view = iview->planes[vplane].isl;
+ if (iview->vk.view_type == VK_IMAGE_VIEW_TYPE_3D) {
+ storage_view.base_array_layer = iview->vk.storage.z_slice_offset;
+ storage_view.array_len = iview->vk.storage.z_slice_count;
}
- iview->planes[vplane].writeonly_storage_surface_state.state = alloc_surface_state(device);
+ enum isl_aux_usage general_aux_usage =
+ anv_layout_to_aux_usage(device->info, image, 1UL << iaspect_bit,
+ VK_IMAGE_USAGE_STORAGE_BIT,
+ VK_IMAGE_LAYOUT_GENERAL,
+ VK_QUEUE_GRAPHICS_BIT |
+ VK_QUEUE_COMPUTE_BIT |
+ VK_QUEUE_TRANSFER_BIT);
+ iview->planes[vplane].storage.state =
+ maybe_alloc_surface_state(device, surface_state_stream);
+
anv_image_fill_surface_state(device, image, 1ULL << iaspect_bit,
- &iview->planes[vplane].isl,
+ &storage_view,
ISL_SURF_USAGE_STORAGE_BIT,
- ISL_AUX_USAGE_NONE, NULL,
- ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY,
- &iview->planes[vplane].writeonly_storage_surface_state,
- NULL);
+ general_aux_usage, NULL,
+ 0,
+ &iview->planes[vplane].storage);
}
}
+}
+
+void
+anv_image_view_finish(struct anv_image_view *iview)
+{
+ struct anv_device *device =
+ container_of(iview->vk.base.device, struct anv_device, vk);
+
+ if (!iview->use_surface_state_stream) {
+ for (uint32_t plane = 0; plane < iview->n_planes; plane++) {
+ if (iview->planes[plane].optimal_sampler.state.alloc_size) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ iview->planes[plane].optimal_sampler.state);
+ }
+
+ if (iview->planes[plane].general_sampler.state.alloc_size) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ iview->planes[plane].general_sampler.state);
+ }
+
+ if (iview->planes[plane].storage.state.alloc_size) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ iview->planes[plane].storage.state);
+ }
+ }
+ }
+
+ vk_image_view_finish(&iview->vk);
+}
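
A short sketch of the ownership rule encoded above: surface states allocated from a caller-provided state stream are released together with that stream, so only the NULL-stream path frees them individually. The variables 'stream' and 'view_create_info' are assumptions for illustration:

struct anv_image_view iview;

/* States come from 'stream'; the view does not own them individually. */
anv_image_view_init(device, &iview, &view_create_info, stream);
/* ... use iview for the lifetime of the stream ... */
anv_image_view_finish(&iview);   /* no pool frees: use_surface_state_stream */

/* With a NULL stream, states come from the bindless surface state pool and
 * anv_image_view_finish() returns them to that pool.
 */
anv_image_view_init(device, &iview, &view_create_info, NULL);
anv_image_view_finish(&iview);
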
+
+VkResult
+anv_CreateImageView(VkDevice _device,
+ const VkImageViewCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkImageView *pView)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_image_view *iview;
+
+ iview = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*iview), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (iview == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ anv_image_view_init(device, iview, pCreateInfo, NULL);
*pView = anv_image_view_to_handle(iview);
@@ -2611,42 +3673,33 @@ void
anv_DestroyImageView(VkDevice _device, VkImageView _iview,
const VkAllocationCallbacks *pAllocator)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_image_view, iview, _iview);
if (!iview)
return;
- for (uint32_t plane = 0; plane < iview->n_planes; plane++) {
- /* Check offset instead of alloc_size because this they might be
- * device->null_surface_state which always has offset == 0. We don't
- * own that one so we don't want to accidentally free it.
- */
- if (iview->planes[plane].optimal_sampler_surface_state.state.offset) {
- anv_state_pool_free(&device->surface_state_pool,
- iview->planes[plane].optimal_sampler_surface_state.state);
- }
-
- if (iview->planes[plane].general_sampler_surface_state.state.offset) {
- anv_state_pool_free(&device->surface_state_pool,
- iview->planes[plane].general_sampler_surface_state.state);
- }
-
- if (iview->planes[plane].storage_surface_state.state.offset) {
- anv_state_pool_free(&device->surface_state_pool,
- iview->planes[plane].storage_surface_state.state);
- }
+ anv_image_view_finish(iview);
+ vk_free2(&iview->vk.base.device->alloc, pAllocator, iview);
+}
- if (iview->planes[plane].writeonly_storage_surface_state.state.offset) {
- anv_state_pool_free(&device->surface_state_pool,
- iview->planes[plane].writeonly_storage_surface_state.state);
- }
- }
+static void
+anv_fill_buffer_view_surface_state(struct anv_device *device,
+ struct anv_buffer_state *state,
+ enum isl_format format,
+ struct isl_swizzle swizzle,
+ isl_surf_usage_flags_t usage,
+ struct anv_address address,
+ uint32_t range, uint32_t stride)
+{
+ anv_fill_buffer_surface_state(device,
+ state->state_data.data,
+ format, swizzle, usage,
+ address, range, stride);
- vk_image_view_destroy(&device->vk, pAllocator, &iview->vk);
+ if (state->state.map)
+ memcpy(state->state.map, state->state_data.data, ANV_SURFACE_STATE_SIZE);
}
-
VkResult
anv_CreateBufferView(VkDevice _device,
const VkBufferViewCreateInfo *pCreateInfo,
@@ -2657,61 +3710,49 @@ anv_CreateBufferView(VkDevice _device,
ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer);
struct anv_buffer_view *view;
- view = vk_object_alloc(&device->vk, pAllocator, sizeof(*view),
- VK_OBJECT_TYPE_BUFFER_VIEW);
+ view = vk_buffer_view_create(&device->vk, pCreateInfo,
+ pAllocator, sizeof(*view));
if (!view)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ const VkBufferUsageFlags2CreateInfoKHR *view_usage_info =
+ vk_find_struct_const(pCreateInfo->pNext, BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR);
+ const VkBufferUsageFlags buffer_usage =
+ view_usage_info != NULL ? view_usage_info->usage : buffer->vk.usage;
- /* TODO: Handle the format swizzle? */
+ struct anv_format_plane format;
+ format = anv_get_format_plane(device->info, pCreateInfo->format,
+ 0, VK_IMAGE_TILING_LINEAR);
- view->format = anv_get_isl_format(&device->info, pCreateInfo->format,
- VK_IMAGE_ASPECT_COLOR_BIT,
- VK_IMAGE_TILING_LINEAR);
- const uint32_t format_bs = isl_format_get_layout(view->format)->bpb / 8;
- view->range = anv_buffer_get_range(buffer, pCreateInfo->offset,
- pCreateInfo->range);
- view->range = align_down_npot_u32(view->range, format_bs);
+ const uint32_t format_bs = isl_format_get_layout(format.isl_format)->bpb / 8;
+ const uint32_t align_range =
+ align_down_npot_u32(view->vk.range, format_bs);
view->address = anv_address_add(buffer->address, pCreateInfo->offset);
- if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) {
- view->surface_state = alloc_surface_state(device);
+ if (buffer_usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) {
+ view->general.state = maybe_alloc_surface_state(device, NULL);
- anv_fill_buffer_surface_state(device, view->surface_state,
- view->format, ISL_SURF_USAGE_TEXTURE_BIT,
- view->address, view->range, format_bs);
+ anv_fill_buffer_view_surface_state(device,
+ &view->general,
+ format.isl_format,
+ format.swizzle,
+ ISL_SURF_USAGE_TEXTURE_BIT,
+ view->address, align_range, format_bs);
} else {
- view->surface_state = (struct anv_state){ 0 };
- }
-
- if (buffer->usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) {
- view->storage_surface_state = alloc_surface_state(device);
- view->writeonly_storage_surface_state = alloc_surface_state(device);
-
- enum isl_format storage_format =
- isl_has_matching_typed_storage_image_format(&device->info,
- view->format) ?
- isl_lower_storage_image_format(&device->info, view->format) :
- ISL_FORMAT_RAW;
-
- anv_fill_buffer_surface_state(device, view->storage_surface_state,
- storage_format, ISL_SURF_USAGE_STORAGE_BIT,
- view->address, view->range,
- (storage_format == ISL_FORMAT_RAW ? 1 :
- isl_format_get_layout(storage_format)->bpb / 8));
-
- /* Write-only accesses should use the original format. */
- anv_fill_buffer_surface_state(device, view->writeonly_storage_surface_state,
- view->format, ISL_SURF_USAGE_STORAGE_BIT,
- view->address, view->range,
- isl_format_get_layout(view->format)->bpb / 8);
-
- isl_buffer_fill_image_param(&device->isl_dev,
- &view->storage_image_param,
- view->format, view->range);
+ view->general.state = ANV_STATE_NULL;
+ }
+
+ if (buffer_usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT) {
+ view->storage.state = maybe_alloc_surface_state(device, NULL);
+
+ anv_fill_buffer_view_surface_state(device,
+ &view->storage,
+ format.isl_format, format.swizzle,
+ ISL_SURF_USAGE_STORAGE_BIT,
+ view->address, align_range, format_bs);
} else {
- view->storage_surface_state = (struct anv_state){ 0 };
- view->writeonly_storage_surface_state = (struct anv_state){ 0 };
+ view->storage.state = ANV_STATE_NULL;
}
*pView = anv_buffer_view_to_handle(view);
@@ -2729,17 +3770,26 @@ anv_DestroyBufferView(VkDevice _device, VkBufferView bufferView,
if (!view)
return;
- if (view->surface_state.alloc_size > 0)
- anv_state_pool_free(&device->surface_state_pool,
- view->surface_state);
+ if (view->general.state.alloc_size > 0) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ view->general.state);
+ }
- if (view->storage_surface_state.alloc_size > 0)
- anv_state_pool_free(&device->surface_state_pool,
- view->storage_surface_state);
+ if (view->storage.state.alloc_size > 0) {
+ anv_state_pool_free(&device->bindless_surface_state_pool,
+ view->storage.state);
+ }
- if (view->writeonly_storage_surface_state.alloc_size > 0)
- anv_state_pool_free(&device->surface_state_pool,
- view->writeonly_storage_surface_state);
+ vk_buffer_view_destroy(&device->vk, pAllocator, &view->vk);
+}
- vk_object_free(&device->vk, pAllocator, view);
+void anv_GetRenderingAreaGranularityKHR(
+ VkDevice _device,
+ const VkRenderingAreaInfoKHR* pRenderingAreaInfo,
+ VkExtent2D* pGranularity)
+{
+ *pGranularity = (VkExtent2D) {
+ .width = 1,
+ .height = 1,
+ };
}
diff --git a/src/intel/vulkan/anv_internal_kernels.c b/src/intel/vulkan/anv_internal_kernels.c
new file mode 100644
index 00000000000..b4496cb51bb
--- /dev/null
+++ b/src/intel/vulkan/anv_internal_kernels.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "compiler/intel_nir.h"
+#include "compiler/brw_compiler.h"
+#include "compiler/brw_nir.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "dev/intel_debug.h"
+#include "intel/compiler/intel_nir.h"
+#include "util/macros.h"
+
+#include "vk_nir.h"
+
+#include "anv_internal_kernels.h"
+
+static bool
+lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin,
+ UNUSED void *data)
+{
+ if (intrin->intrinsic != nir_intrinsic_load_base_workgroup_id)
+ return false;
+
+ b->cursor = nir_instr_remove(&intrin->instr);
+ nir_def_rewrite_uses(&intrin->def, nir_imm_zero(b, 3, 32));
+ return true;
+}
+
+static void
+link_libanv(nir_shader *nir, const nir_shader *libanv)
+{
+ nir_link_shader_functions(nir, libanv);
+ NIR_PASS_V(nir, nir_inline_functions);
+ NIR_PASS_V(nir, nir_remove_non_entrypoints);
+ NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
+ glsl_get_cl_type_size_align);
+ NIR_PASS_V(nir, nir_opt_deref);
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_lower_explicit_io,
+ nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
+ nir_var_mem_global,
+ nir_address_format_62bit_generic);
+}
+
+static struct anv_shader_bin *
+compile_shader(struct anv_device *device,
+ const nir_shader *libanv,
+ enum anv_internal_kernel_name shader_name,
+ gl_shader_stage stage,
+ const char *name,
+ const void *hash_key,
+ uint32_t hash_key_size,
+ uint32_t sends_count_expectation)
+{
+ const nir_shader_compiler_options *nir_options =
+ device->physical->compiler->nir_options[stage];
+
+ nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
+ "%s", name);
+
+ uint32_t uniform_size =
+ anv_genX(device->info, call_internal_shader)(&b, shader_name);
+
+ nir_shader *nir = b.shader;
+
+ link_libanv(nir, libanv);
+
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_opt_cse);
+ NIR_PASS_V(nir, nir_opt_gcm, true);
+ NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false);
+
+ NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
+
+ NIR_PASS_V(nir, nir_split_var_copies);
+ NIR_PASS_V(nir, nir_split_per_member_structs);
+
+ if (stage == MESA_SHADER_COMPUTE) {
+ nir->info.workgroup_size[0] = 16;
+ nir->info.workgroup_size[1] = 1;
+ nir->info.workgroup_size[2] = 1;
+ }
+
+ struct brw_compiler *compiler = device->physical->compiler;
+ struct brw_nir_compiler_opts opts = {};
+ brw_preprocess_nir(compiler, nir, &opts);
+
+ NIR_PASS_V(nir, nir_propagate_invariant, false);
+
+ if (stage == MESA_SHADER_FRAGMENT) {
+ NIR_PASS_V(nir, nir_lower_input_attachments,
+ &(nir_input_attachment_options) {
+ .use_fragcoord_sysval = true,
+ .use_layer_id_sysval = true,
+ });
+ } else {
+ nir_lower_compute_system_values_options options = {
+ .has_base_workgroup_id = true,
+ .lower_cs_local_id_to_index = true,
+ .lower_workgroup_id_to_index = gl_shader_stage_is_mesh(stage),
+ };
+ NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
+ NIR_PASS_V(nir, nir_shader_intrinsics_pass, lower_base_workgroup_id,
+ nir_metadata_block_index | nir_metadata_dominance, NULL);
+ }
+
+ /* Reset sizes before gathering information */
+ nir->global_mem_size = 0;
+ nir->scratch_size = 0;
+ nir->info.shared_size = 0;
+ nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+
+ NIR_PASS_V(nir, nir_copy_prop);
+ NIR_PASS_V(nir, nir_opt_constant_folding);
+ NIR_PASS_V(nir, nir_opt_dce);
+
+ union brw_any_prog_key key;
+ memset(&key, 0, sizeof(key));
+
+ union brw_any_prog_data prog_data;
+ memset(&prog_data, 0, sizeof(prog_data));
+
+ if (stage == MESA_SHADER_COMPUTE) {
+ NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics,
+ device->info, &prog_data.cs);
+ }
+
+   /* Do vectorizing here. For some reason when trying to do it in the
+    * backend this just isn't working.
+ */
+ nir_load_store_vectorize_options options = {
+ .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
+ .callback = brw_nir_should_vectorize_mem,
+ .robust_modes = (nir_variable_mode)0,
+ };
+ NIR_PASS_V(nir, nir_opt_load_store_vectorize, &options);
+
+ nir->num_uniforms = uniform_size;
+
+ prog_data.base.nr_params = nir->num_uniforms / 4;
+
+ brw_nir_analyze_ubo_ranges(compiler, nir, prog_data.base.ubo_ranges);
+
+ void *temp_ctx = ralloc_context(NULL);
+
+ const unsigned *program;
+ if (stage == MESA_SHADER_FRAGMENT) {
+ struct brw_compile_stats stats[3];
+ struct brw_compile_fs_params params = {
+ .base = {
+ .nir = nir,
+ .log_data = device,
+ .debug_flag = DEBUG_WM,
+ .stats = stats,
+ .mem_ctx = temp_ctx,
+ },
+ .key = &key.wm,
+ .prog_data = &prog_data.wm,
+ };
+ program = brw_compile_fs(compiler, &params);
+
+ unsigned stat_idx = 0;
+ if (prog_data.wm.dispatch_8) {
+ assert(stats[stat_idx].spills == 0);
+ assert(stats[stat_idx].fills == 0);
+ assert(stats[stat_idx].sends == sends_count_expectation);
+ stat_idx++;
+ }
+ if (prog_data.wm.dispatch_16) {
+ assert(stats[stat_idx].spills == 0);
+ assert(stats[stat_idx].fills == 0);
+ assert(stats[stat_idx].sends == sends_count_expectation);
+ stat_idx++;
+ }
+ if (prog_data.wm.dispatch_32) {
+ assert(stats[stat_idx].spills == 0);
+ assert(stats[stat_idx].fills == 0);
+ assert(stats[stat_idx].sends == sends_count_expectation * 2);
+ stat_idx++;
+ }
+ } else {
+ struct brw_compile_stats stats;
+ struct brw_compile_cs_params params = {
+ .base = {
+ .nir = nir,
+ .stats = &stats,
+ .log_data = device,
+ .debug_flag = DEBUG_CS,
+ .mem_ctx = temp_ctx,
+ },
+ .key = &key.cs,
+ .prog_data = &prog_data.cs,
+ };
+ program = brw_compile_cs(compiler, &params);
+
+ assert(stats.spills == 0);
+ assert(stats.fills == 0);
+ assert(stats.sends == sends_count_expectation);
+ }
+
+ assert(prog_data.base.total_scratch == 0);
+
+ struct anv_pipeline_bind_map empty_bind_map = {};
+ struct anv_push_descriptor_info empty_push_desc_info = {};
+ struct anv_shader_upload_params upload_params = {
+ .stage = nir->info.stage,
+ .key_data = hash_key,
+ .key_size = hash_key_size,
+ .kernel_data = program,
+ .kernel_size = prog_data.base.program_size,
+ .prog_data = &prog_data.base,
+ .prog_data_size = sizeof(prog_data),
+ .bind_map = &empty_bind_map,
+ .push_desc_info = &empty_push_desc_info,
+ };
+
+ struct anv_shader_bin *kernel =
+ anv_device_upload_kernel(device, device->internal_cache, &upload_params);
+
+ ralloc_free(temp_ctx);
+ ralloc_free(nir);
+
+ return kernel;
+}
+
+VkResult
+anv_device_get_internal_shader(struct anv_device *device,
+ enum anv_internal_kernel_name name,
+ struct anv_shader_bin **out_bin)
+{
+ const struct {
+ struct {
+ char name[40];
+ } key;
+
+ gl_shader_stage stage;
+
+ uint32_t send_count;
+ } internal_kernels[] = {
+ [ANV_INTERNAL_KERNEL_GENERATED_DRAWS] = {
+ .key = {
+ .name = "anv-generated-indirect-draws",
+ },
+ .stage = MESA_SHADER_FRAGMENT,
+ .send_count = (device->info->ver == 9 ?
+ /* 1 load +
+ * 4 stores +
+ * 2 * (2 loads + 2 stores) +
+ * 3 stores
+ */
+ 16 :
+ /* 1 load +
+ * 2 * (2 loads + 3 stores) +
+ * 3 stores
+ */
+ 14),
+ },
+ [ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE] = {
+ .key = {
+ .name = "anv-copy-query-compute",
+ },
+ .stage = MESA_SHADER_COMPUTE,
+ .send_count = device->info->verx10 >= 125 ?
+ 9 /* 4 loads + 4 stores + 1 EOT */ :
+ 8 /* 3 loads + 4 stores + 1 EOT */,
+ },
+ [ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT] = {
+ .key = {
+ .name = "anv-copy-query-fragment",
+ },
+ .stage = MESA_SHADER_FRAGMENT,
+ .send_count = 8 /* 3 loads + 4 stores + 1 EOT */,
+ },
+ [ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE] = {
+ .key = {
+ .name = "anv-memcpy-compute",
+ },
+ .stage = MESA_SHADER_COMPUTE,
+ .send_count = device->info->verx10 >= 125 ?
+ 10 /* 5 loads (1 pull constants) + 4 stores + 1 EOT */ :
+ 9 /* 4 loads + 4 stores + 1 EOT */,
+ },
+ };
+
+ struct anv_shader_bin *bin =
+ p_atomic_read(&device->internal_kernels[name]);
+ if (bin != NULL) {
+ *out_bin = bin;
+ return VK_SUCCESS;
+ }
+
+ bin =
+ anv_device_search_for_kernel(device,
+ device->internal_cache,
+ &internal_kernels[name].key,
+ sizeof(internal_kernels[name].key),
+ NULL);
+ if (bin != NULL) {
+ p_atomic_set(&device->internal_kernels[name], bin);
+ *out_bin = bin;
+ return VK_SUCCESS;
+ }
+
+ void *mem_ctx = ralloc_context(NULL);
+
+ nir_shader *libanv_shaders =
+ anv_genX(device->info, load_libanv_shader)(device, mem_ctx);
+
+ bin = compile_shader(device,
+ libanv_shaders,
+ name,
+ internal_kernels[name].stage,
+ internal_kernels[name].key.name,
+ &internal_kernels[name].key,
+ sizeof(internal_kernels[name].key),
+ internal_kernels[name].send_count);
+ if (bin == NULL)
+ return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
+                       "Unable to compile internal kernel");
+
+ /* The cache already has a reference and it's not going anywhere so
+ * there is no need to hold a second reference.
+ */
+ anv_shader_bin_unref(device, bin);
+
+ p_atomic_set(&device->internal_kernels[name], bin);
+
+ *out_bin = bin;
+ return VK_SUCCESS;
+}
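
A hedged usage sketch of the lookup above: a caller fetches (and lazily compiles) an internal kernel before emitting work that uses it. The error handling shown is illustrative:

struct anv_shader_bin *bin;
VkResult result =
   anv_device_get_internal_shader(device,
                                  ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
                                  &bin);
if (result != VK_SUCCESS)
   return result;
/* 'bin' stays valid without taking an extra reference: the internal cache
 * and device->internal_kernels[] keep it alive for the device's lifetime.
 */
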
+
+VkResult
+anv_device_init_internal_kernels(struct anv_device *device)
+{
+ const struct intel_l3_weights w =
+ intel_get_default_l3_weights(device->info,
+ true /* wants_dc_cache */,
+ false /* needs_slm */);
+ device->internal_kernels_l3_config = intel_get_l3_config(device->info, w);
+
+ return VK_SUCCESS;
+}
+
+void
+anv_device_finish_internal_kernels(struct anv_device *device)
+{
+}
diff --git a/src/intel/vulkan/anv_internal_kernels.h b/src/intel/vulkan/anv_internal_kernels.h
new file mode 100644
index 00000000000..d0e325add2a
--- /dev/null
+++ b/src/intel/vulkan/anv_internal_kernels.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef ANV_GENERATED_INDIRECT_DRAWS_H
+#define ANV_GENERATED_INDIRECT_DRAWS_H
+
+#include "libintel_shaders.h"
+
+struct PACKED anv_gen_indirect_params {
+ /* Draw ID buffer address (only used on Gfx9) */
+ uint64_t draw_id_addr;
+
+ /* Indirect data buffer address (only used on Gfx9) */
+ uint64_t indirect_data_addr;
+
+ /* Stride between each elements of the indirect data buffer */
+ uint32_t indirect_data_stride;
+
+ uint32_t flags; /* 0-7: bits, 8-15: mocs, 16-23: cmd_dws */
+
+ /* Base number of the draw ID, it is added to the index computed from the
+ * gl_FragCoord
+ */
+ uint32_t draw_base;
+
+ /* Maximum number of draws (equals to draw_count for indirect draws without
+ * an indirect count)
+ */
+ uint32_t max_draw_count;
+
+ /* Number of draws to generate in the ring buffer (only useful in ring
+ * buffer mode)
+ */
+ uint32_t ring_count;
+
+ /* Instance multiplier for multi view */
+ uint32_t instance_multiplier;
+
+ /* Address where to jump at to generate further draws (used with ring mode)
+ */
+ uint64_t gen_addr;
+
+ /* Address where to jump at after the generated draw (only used with
+ * indirect draw count variants)
+ */
+ uint64_t end_addr;
+
+ /* Destination of the generated draw commands */
+ uint64_t generated_cmds_addr;
+
+ /* Draw count address (points to the draw_count field in cases) */
+ uint64_t draw_count_addr;
+
+ /* Draw count value for non count variants of draw indirect commands */
+ uint32_t draw_count;
+
+ /* CPU side pointer to the previous item when number of draws has to be
+ * split into smaller chunks, see while loop in
+ * genX(cmd_buffer_emit_indirect_generated_draws)
+ */
+ struct anv_gen_indirect_params *prev;
+};
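
The flags field above packs three byte-wide groups (flag bits, MOCS, command dword count). A purely illustrative packing helper following that layout; these functions are not part of anv:

/* Hypothetical helpers matching "0-7: bits, 8-15: mocs, 16-23: cmd_dws". */
static inline uint32_t
pack_gen_indirect_flags(uint8_t flag_bits, uint8_t mocs, uint8_t cmd_dws)
{
   return (uint32_t)flag_bits |
          ((uint32_t)mocs << 8) |
          ((uint32_t)cmd_dws << 16);
}

static inline uint8_t
unpack_gen_indirect_mocs(uint32_t flags)
{
   return (flags >> 8) & 0xff;
}
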
+
+struct PACKED anv_query_copy_params {
+ /* ANV_COPY_QUERY_FLAG_* flags */
+ uint32_t flags;
+
+ /* Number of queries to copy */
+ uint32_t num_queries;
+
+ /* Number of items to write back in the results per query */
+ uint32_t num_items;
+
+ /* First query to copy result from */
+ uint32_t query_base;
+
+ /* Query stride in bytes */
+ uint32_t query_stride;
+
+ /* Offset at which the data should be read from */
+ uint32_t query_data_offset;
+
+ /* Stride of destination writes */
+ uint32_t destination_stride;
+
+ /* We need to be 64 bit aligned, or 32 bit builds get
+ * very unhappy.
+ */
+ uint32_t padding;
+
+ /* Address of the query pool */
+ uint64_t query_data_addr;
+
+ /* Destination address of the results */
+ uint64_t destination_addr;
+};
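
A sketch of how a copy kernel plausibly derives per-query source and destination addresses from these parameters, written as host C for clarity; this is not the actual libanv kernel code:

/* Hypothetical addressing for query index i (0 <= i < num_queries). */
static void
copy_one_query_addresses(const struct anv_query_copy_params *params,
                         uint32_t i, uint64_t *src, uint64_t *dst)
{
   *src = params->query_data_addr +
          (uint64_t)(params->query_base + i) * params->query_stride +
          params->query_data_offset;
   *dst = params->destination_addr +
          (uint64_t)i * params->destination_stride;
   /* num_items values are then read from *src and written to *dst,
    * honoring the ANV_COPY_QUERY_FLAG_* bits in params->flags.
    */
}
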
+
+struct PACKED anv_memcpy_params {
+   /* Number of dwords to copy */
+ uint32_t num_dwords;
+
+ uint32_t pad;
+
+ /* Source address of the copy */
+ uint64_t src_addr;
+
+ /* Destination address of the copy */
+ uint64_t dst_addr;
+};
+
+#endif /* ANV_GENERATED_INDIRECT_DRAWS_H */
diff --git a/src/intel/vulkan/anv_kmd_backend.c b/src/intel/vulkan/anv_kmd_backend.c
new file mode 100644
index 00000000000..8ce882bba26
--- /dev/null
+++ b/src/intel/vulkan/anv_kmd_backend.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+
+#include "anv_kmd_backend.h"
+#include "anv_private.h"
+
+const struct anv_kmd_backend *
+anv_kmd_backend_get(enum intel_kmd_type type)
+{
+ switch (type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_kmd_backend_get();
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_kmd_backend_get();
+ case INTEL_KMD_TYPE_STUB:
+ return anv_stub_kmd_backend_get();
+ default:
+ return NULL;
+ }
+}
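
A hedged sketch of backend selection during device initialization; the exact field names on the anv device structures are assumptions:

const struct anv_kmd_backend *backend =
   anv_kmd_backend_get(physical_device->info.kmd_type);
if (backend == NULL) {
   return vk_errorf(physical_device, VK_ERROR_INCOMPATIBLE_DRIVER,
                    "unsupported kernel mode driver");
}
device->kmd_backend = backend;
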
diff --git a/src/intel/vulkan/anv_kmd_backend.h b/src/intel/vulkan/anv_kmd_backend.h
new file mode 100644
index 00000000000..13d3799858e
--- /dev/null
+++ b/src/intel/vulkan/anv_kmd_backend.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "vulkan/vulkan_core.h"
+#include "vk_sync.h"
+
+#include "dev/intel_device_info.h"
+#include "dev/intel_kmd.h"
+
+struct anv_bo;
+enum anv_bo_alloc_flags;
+struct anv_cmd_buffer;
+struct anv_device;
+struct anv_queue;
+struct anv_query_pool;
+struct anv_utrace_submit;
+struct anv_sparse_submission;
+struct anv_trtt_batch_bo;
+
+enum anv_vm_bind_op {
+ /* bind vma specified in anv_vm_bind */
+ ANV_VM_BIND,
+ /* unbind vma specified in anv_vm_bind */
+ ANV_VM_UNBIND,
+ /* unbind all vmas of anv_vm_bind::bo, address and size fields must be set to 0 */
+ ANV_VM_UNBIND_ALL,
+};
+
+struct anv_vm_bind {
+ struct anv_bo *bo; /* Or NULL in case of a NULL binding. */
+ uint64_t address; /* Includes the resource offset. */
+ uint64_t bo_offset; /* Also known as the memory offset. */
+ uint64_t size;
+ enum anv_vm_bind_op op;
+};
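
For illustration, two bind descriptors following the conventions documented above: a regular bind at a VMA, and a full unbind where address and size must stay 0. The variables 'bo' and 'bo_vma_address' are assumed to come from the caller:

struct anv_vm_bind bind = {
   .bo = bo,
   .address = bo_vma_address,   /* includes the resource offset */
   .bo_offset = 0,
   .size = bo->size,
   .op = ANV_VM_BIND,
};

struct anv_vm_bind unbind_all = {
   .bo = bo,
   .address = 0,                /* must be 0 for ANV_VM_UNBIND_ALL */
   .size = 0,                   /* must be 0 for ANV_VM_UNBIND_ALL */
   .op = ANV_VM_UNBIND_ALL,
};
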
+
+/* These flags apply only to the vm_bind() ioctl backend operations, not to
+ * the higher-level concept of resource address binding. In other words: they
+ * don't apply to TR-TT, which also uses other structs with "vm_bind" in their
+ * names.
+ */
+enum anv_vm_bind_flags {
+ ANV_VM_BIND_FLAG_NONE = 0,
+ /* The most recent bind_timeline wait point is waited for during every
+ * command submission. This flag allows the vm_bind operation to create a
+ * new timeline point and signal it upon completion.
+ */
+ ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE = 1 << 0,
+};
+
+struct anv_kmd_backend {
+ /*
+ * Create a gem buffer.
+ * Return the gem handle in case of success otherwise returns 0.
+ */
+ uint32_t (*gem_create)(struct anv_device *device,
+ const struct intel_memory_class_instance **regions,
+ uint16_t num_regions, uint64_t size,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t *actual_size);
+ uint32_t (*gem_create_userptr)(struct anv_device *device, void *mem, uint64_t size);
+ void (*gem_close)(struct anv_device *device, struct anv_bo *bo);
+ /* Returns MAP_FAILED on error */
+ void *(*gem_mmap)(struct anv_device *device, struct anv_bo *bo,
+ uint64_t offset, uint64_t size, void *placed_addr);
+
+ /*
+ * Bind things however you want.
+    * This is intended for sparse resources, so it's a little lower level
+    * than the _bo variants below.
+ */
+ VkResult (*vm_bind)(struct anv_device *device,
+ struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags);
+
+ /*
+ * Fully bind or unbind a BO.
+ * This is intended for general buffer creation/destruction, so it creates
+    * a new point in the bind_timeline, which will be waited on the next
+    * time a batch is submitted.
+ */
+ VkResult (*vm_bind_bo)(struct anv_device *device, struct anv_bo *bo);
+ VkResult (*vm_unbind_bo)(struct anv_device *device, struct anv_bo *bo);
+
+ VkResult (*execute_simple_batch)(struct anv_queue *queue,
+ struct anv_bo *batch_bo,
+ uint32_t batch_bo_size,
+ bool is_companion_rcs_batch);
+ VkResult (*execute_trtt_batch)(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo);
+ VkResult (*queue_exec_locked)(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit);
+ VkResult (*queue_exec_trace)(struct anv_queue *queue,
+ struct anv_utrace_submit *submit);
+ uint32_t (*bo_alloc_flags_to_bo_flags)(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags);
+};
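
A sketch of how callers dispatch through this vtable when a BO becomes resident and when it is freed; actual anv call sites wrap this with more bookkeeping:

/* On allocation: make the BO resident in the VM. */
VkResult result = device->kmd_backend->vm_bind_bo(device, bo);
if (result != VK_SUCCESS)
   return result;

/* ... the BO is usable by subsequent submissions ... */

/* On free: drop the mapping before releasing the GEM handle. */
device->kmd_backend->vm_unbind_bo(device, bo);
device->kmd_backend->gem_close(device, bo);
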
+
+const struct anv_kmd_backend *anv_kmd_backend_get(enum intel_kmd_type type);
+
+/* Internal functions, should only be called by anv_kmd_backend_get() */
+const struct anv_kmd_backend *anv_i915_kmd_backend_get(void);
+const struct anv_kmd_backend *anv_xe_kmd_backend_get(void);
+const struct anv_kmd_backend *anv_stub_kmd_backend_get(void);
diff --git a/src/intel/vulkan/anv_measure.c b/src/intel/vulkan/anv_measure.c
index 2ac654b7c05..8e778946ea8 100644
--- a/src/intel/vulkan/anv_measure.c
+++ b/src/intel/vulkan/anv_measure.c
@@ -28,7 +28,7 @@
#include <sys/types.h>
#include "common/intel_measure.h"
-#include "util/debug.h"
+#include "util/u_debug.h"
struct anv_measure_batch {
struct anv_bo *bo;
@@ -38,32 +38,6 @@ struct anv_measure_batch {
void
anv_measure_device_init(struct anv_physical_device *device)
{
- switch (device->info.verx10) {
- case 125:
- device->cmd_emit_timestamp = &gfx125_cmd_emit_timestamp;
- break;
- case 120:
- device->cmd_emit_timestamp = &gfx12_cmd_emit_timestamp;
- break;
- case 110:
- device->cmd_emit_timestamp = &gfx11_cmd_emit_timestamp;
- break;
- case 90:
- device->cmd_emit_timestamp = &gfx9_cmd_emit_timestamp;
- break;
- case 80:
- device->cmd_emit_timestamp = &gfx8_cmd_emit_timestamp;
- break;
- case 75:
- device->cmd_emit_timestamp = &gfx75_cmd_emit_timestamp;
- break;
- case 70:
- device->cmd_emit_timestamp = &gfx7_cmd_emit_timestamp;
- break;
- default:
- assert(false);
- }
-
/* initialise list of measure structures that await rendering */
struct intel_measure_device *measure_device = &device->measure_device;
intel_measure_init(measure_device);
@@ -108,21 +82,25 @@ anv_measure_init(struct anv_cmd_buffer *cmd_buffer)
const size_t batch_bytes = sizeof(struct anv_measure_batch) +
config->batch_size * sizeof(struct intel_measure_snapshot);
struct anv_measure_batch * measure =
- vk_alloc(&cmd_buffer->pool->alloc,
+ vk_alloc(&cmd_buffer->vk.pool->alloc,
batch_bytes, 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
memset(measure, 0, batch_bytes);
+ cmd_buffer->measure = measure;
+ if(config->cpu_measure)
+ return;
+
ASSERTED VkResult result =
anv_device_alloc_bo(device, "measure data",
config->batch_size * sizeof(uint64_t),
- ANV_BO_ALLOC_MAPPED,
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ ANV_BO_ALLOC_INTERNAL,
0,
(struct anv_bo**)&measure->bo);
measure->base.timestamps = measure->bo->map;
assert(result == VK_SUCCESS);
-
- cmd_buffer->measure = measure;
}
static void
@@ -135,33 +113,37 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
struct anv_measure_batch *measure = cmd_buffer->measure;
struct anv_physical_device *device = cmd_buffer->device->physical;
struct intel_measure_device *measure_device = &device->measure_device;
+ struct intel_measure_config *config = config_from_command_buffer(cmd_buffer);
+ enum anv_timestamp_capture_type capture_type;
+ unsigned index = measure->base.index++;
- const unsigned device_frame = measure_device->frame;
-
- /* if the command buffer is not associated with a frame, associate it with
- * the most recent acquired frame
- */
- if (measure->base.frame == 0)
- measure->base.frame = device_frame;
-
- uintptr_t framebuffer = (uintptr_t)cmd_buffer->state.framebuffer;
-
- if (!measure->base.framebuffer &&
- cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
- /* secondary command buffer inherited the framebuffer from the primary */
- measure->base.framebuffer = framebuffer;
+ if (event_name == NULL)
+ event_name = intel_measure_snapshot_string(type);
- /* verify framebuffer has been properly tracked */
- assert(type == INTEL_SNAPSHOT_END ||
- framebuffer == measure->base.framebuffer ||
- framebuffer == 0 ); /* compute has no framebuffer */
+ if (config->cpu_measure) {
+ intel_measure_print_cpu_result(measure_device->frame,
+ measure->base.batch_count,
+ measure->base.batch_size,
+ index/2,
+ measure->base.event_count,
+ count,
+ event_name);
+ return;
+ }
- unsigned index = measure->base.index++;
- (*device->cmd_emit_timestamp)(batch, measure->bo, index * sizeof(uint64_t));
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
+ capture_type = ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
+ else
+ capture_type = ANV_TIMESTAMP_CAPTURE_AT_CS_STALL;
- if (event_name == NULL)
- event_name = intel_measure_snapshot_string(type);
+ (*device->cmd_emit_timestamp)(batch, cmd_buffer->device,
+ (struct anv_address) {
+ .bo = measure->bo,
+ .offset = index * sizeof(uint64_t) },
+ capture_type,
+ NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));
@@ -169,18 +151,23 @@ anv_measure_start_snapshot(struct anv_cmd_buffer *cmd_buffer,
snapshot->count = (unsigned) count;
snapshot->event_count = measure->base.event_count;
snapshot->event_name = event_name;
- snapshot->framebuffer = framebuffer;
-
- if (type == INTEL_SNAPSHOT_COMPUTE && cmd_buffer->state.compute.pipeline) {
- snapshot->cs = (uintptr_t) cmd_buffer->state.compute.pipeline->cs;
- } else if (cmd_buffer->state.gfx.pipeline) {
+ snapshot->renderpass = (type == INTEL_SNAPSHOT_COMPUTE) ? 0
+ : measure->base.renderpass;
+
+ if (type == INTEL_SNAPSHOT_COMPUTE && cmd_buffer->state.compute.base.pipeline) {
+ const struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+ snapshot->cs = pipeline->source_hash;
+ } else if (type == INTEL_SNAPSHOT_DRAW && cmd_buffer->state.gfx.base.pipeline) {
const struct anv_graphics_pipeline *pipeline =
- cmd_buffer->state.gfx.pipeline;
- snapshot->vs = (uintptr_t) pipeline->shaders[MESA_SHADER_VERTEX];
- snapshot->tcs = (uintptr_t) pipeline->shaders[MESA_SHADER_TESS_CTRL];
- snapshot->tes = (uintptr_t) pipeline->shaders[MESA_SHADER_TESS_EVAL];
- snapshot->gs = (uintptr_t) pipeline->shaders[MESA_SHADER_GEOMETRY];
- snapshot->fs = (uintptr_t) pipeline->shaders[MESA_SHADER_FRAGMENT];
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ snapshot->vs = pipeline->base.source_hashes[MESA_SHADER_VERTEX];
+ snapshot->tcs = pipeline->base.source_hashes[MESA_SHADER_TESS_CTRL];
+ snapshot->tes = pipeline->base.source_hashes[MESA_SHADER_TESS_EVAL];
+ snapshot->gs = pipeline->base.source_hashes[MESA_SHADER_GEOMETRY];
+ snapshot->fs = pipeline->base.source_hashes[MESA_SHADER_FRAGMENT];
+ snapshot->ms = pipeline->base.source_hashes[MESA_SHADER_MESH];
+ snapshot->ts = pipeline->base.source_hashes[MESA_SHADER_TASK];
}
}
@@ -191,11 +178,26 @@ anv_measure_end_snapshot(struct anv_cmd_buffer *cmd_buffer,
struct anv_batch *batch = &cmd_buffer->batch;
struct anv_measure_batch *measure = cmd_buffer->measure;
struct anv_physical_device *device = cmd_buffer->device->physical;
-
+ struct intel_measure_config *config = config_from_command_buffer(cmd_buffer);
+ enum anv_timestamp_capture_type capture_type;
unsigned index = measure->base.index++;
assert(index % 2 == 1);
- (*device->cmd_emit_timestamp)(batch, measure->bo, index * sizeof(uint64_t));
+ if (config->cpu_measure)
+ return;
+
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
+ capture_type = ANV_TIMESTAMP_CAPTURE_END_OF_PIPE;
+ else
+ capture_type = ANV_TIMESTAMP_CAPTURE_AT_CS_STALL;
+
+ (*device->cmd_emit_timestamp)(batch, cmd_buffer->device,
+ (struct anv_address) {
+ .bo = measure->bo,
+ .offset = index * sizeof(uint64_t) },
+ capture_type,
+ NULL);
struct intel_measure_snapshot *snapshot = &(measure->base.snapshots[index]);
memset(snapshot, 0, sizeof(*snapshot));
@@ -207,7 +209,7 @@ static bool
state_changed(struct anv_cmd_buffer *cmd_buffer,
enum intel_measure_snapshot_type type)
{
- uintptr_t vs=0, tcs=0, tes=0, gs=0, fs=0, cs=0;
+ uint32_t vs=0, tcs=0, tes=0, gs=0, fs=0, cs=0, ms=0, ts=0;
if (cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)
/* can't record timestamps in this mode */
@@ -215,22 +217,25 @@ state_changed(struct anv_cmd_buffer *cmd_buffer,
if (type == INTEL_SNAPSHOT_COMPUTE) {
const struct anv_compute_pipeline *cs_pipe =
- cmd_buffer->state.compute.pipeline;
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
assert(cs_pipe);
- cs = (uintptr_t)cs_pipe->cs;
+ cs = cs_pipe->source_hash;
} else if (type == INTEL_SNAPSHOT_DRAW) {
- const struct anv_graphics_pipeline *gfx = cmd_buffer->state.gfx.pipeline;
+ const struct anv_graphics_pipeline *gfx =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
assert(gfx);
- vs = (uintptr_t) gfx->shaders[MESA_SHADER_VERTEX];
- tcs = (uintptr_t) gfx->shaders[MESA_SHADER_TESS_CTRL];
- tes = (uintptr_t) gfx->shaders[MESA_SHADER_TESS_EVAL];
- gs = (uintptr_t) gfx->shaders[MESA_SHADER_GEOMETRY];
- fs = (uintptr_t) gfx->shaders[MESA_SHADER_FRAGMENT];
+ vs = gfx->base.source_hashes[MESA_SHADER_VERTEX];
+ tcs = gfx->base.source_hashes[MESA_SHADER_TESS_CTRL];
+ tes = gfx->base.source_hashes[MESA_SHADER_TESS_EVAL];
+ gs = gfx->base.source_hashes[MESA_SHADER_GEOMETRY];
+ fs = gfx->base.source_hashes[MESA_SHADER_FRAGMENT];
+ ms = gfx->base.source_hashes[MESA_SHADER_MESH];
+ ts = gfx->base.source_hashes[MESA_SHADER_TASK];
}
/* else blorp, all programs NULL */
return intel_measure_state_changed(&cmd_buffer->measure->base,
- vs, tcs, tes, gs, fs, cs);
+ vs, tcs, tes, gs, fs, cs, ms, ts);
}
void
@@ -315,25 +320,15 @@ anv_measure_reset(struct anv_cmd_buffer *cmd_buffer)
* yet been processed
*/
intel_measure_gather(&device->physical->measure_device,
- &device->info);
+ device->info);
assert(cmd_buffer->device != NULL);
measure->base.index = 0;
- measure->base.framebuffer = 0;
+ measure->base.renderpass = 0;
measure->base.frame = 0;
measure->base.event_count = 0;
list_inithead(&measure->base.link);
-
- anv_device_release_bo(device, measure->bo);
- ASSERTED VkResult result =
- anv_device_alloc_bo(device, "measure data",
- config->batch_size * sizeof(uint64_t),
- ANV_BO_ALLOC_MAPPED,
- 0,
- (struct anv_bo**)&measure->bo);
- measure->base.timestamps = measure->bo->map;
- assert(result == VK_SUCCESS);
}
void
@@ -354,8 +349,9 @@ anv_measure_destroy(struct anv_cmd_buffer *cmd_buffer)
*/
intel_measure_gather(&physical->measure_device, &physical->info);
- anv_device_release_bo(device, measure->bo);
- vk_free(&cmd_buffer->pool->alloc, measure);
+ if (measure->bo != NULL)
+ anv_device_release_bo(device, measure->bo);
+ vk_free(&cmd_buffer->vk.pool->alloc, measure);
cmd_buffer->measure = NULL;
}
@@ -395,19 +391,30 @@ _anv_measure_submit(struct anv_cmd_buffer *cmd_buffer)
if (measure == NULL)
return;
- if (measure->base.index == 0)
+ struct intel_measure_batch *base = &measure->base;
+ if (base->index == 0)
/* no snapshots were started */
return;
/* finalize snapshots and enqueue them */
static unsigned cmd_buffer_count = 0;
- measure->base.batch_count = p_atomic_inc_return(&cmd_buffer_count);
+ base->batch_count = p_atomic_inc_return(&cmd_buffer_count);
+ base->batch_size = cmd_buffer->total_batch_size;
+ base->frame = measure_device->frame;
- if (measure->base.index %2 == 1) {
- anv_measure_end_snapshot(cmd_buffer, measure->base.event_count);
- measure->base.event_count = 0;
+ if (base->index %2 == 1) {
+ anv_measure_end_snapshot(cmd_buffer, base->event_count);
+ base->event_count = 0;
}
+ if (config->cpu_measure)
+ return;
+
+ /* Mark the final timestamp as 'not completed'. This marker will be used
+ * to verify that rendering is complete.
+ */
+ base->timestamps[base->index - 1] = 0;
+
/* add to the list of submitted snapshots */
pthread_mutex_lock(&measure_device->mutex);
list_addtail(&measure->base.link, &measure_device->queued_snapshots);
@@ -418,7 +425,7 @@ _anv_measure_submit(struct anv_cmd_buffer *cmd_buffer)
* Hook for the start of a frame.
*/
void
-anv_measure_acquire(struct anv_device *device)
+_anv_measure_acquire(struct anv_device *device)
{
struct intel_measure_config *config = config_from_device(device);
struct intel_measure_device *measure_device = &device->physical->measure_device;
@@ -456,14 +463,10 @@ _anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer)
{
struct intel_measure_config *config = config_from_command_buffer(cmd_buffer);
struct anv_measure_batch *measure = cmd_buffer->measure;
+ struct anv_physical_device *device = cmd_buffer->device->physical;
+ struct intel_measure_device *measure_device = &device->measure_device;
- if (!config)
- return;
- if (measure == NULL)
- return;
-
- if (measure->base.framebuffer == (uintptr_t) cmd_buffer->state.framebuffer)
- /* no change */
+ if (!config || !measure)
return;
bool filtering = (config->flags & (INTEL_MEASURE_RENDERPASS |
@@ -475,7 +478,8 @@ _anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer)
measure->base.event_count = 0;
}
- measure->base.framebuffer = (uintptr_t) cmd_buffer->state.framebuffer;
+ measure->base.renderpass =
+ (uintptr_t) p_atomic_inc_return(&measure_device->render_pass_count);
}
void
diff --git a/src/intel/vulkan/anv_measure.h b/src/intel/vulkan/anv_measure.h
index bca0fc0c207..a058a5ac51e 100644
--- a/src/intel/vulkan/anv_measure.h
+++ b/src/intel/vulkan/anv_measure.h
@@ -46,7 +46,7 @@ void _anv_measure_endcommandbuffer(struct anv_cmd_buffer *cmd_buffer);
void _anv_measure_beginrenderpass(struct anv_cmd_buffer *cmd_buffer);
/* tracks frame progression */
-void anv_measure_acquire(struct anv_device *device);
+void _anv_measure_acquire(struct anv_device *device);
/* should be combined with endcommandbuffer */
void _anv_measure_submit(struct anv_cmd_buffer *cmd_buffer);
@@ -55,6 +55,10 @@ void
_anv_measure_add_secondary(struct anv_cmd_buffer *primary,
struct anv_cmd_buffer *secondary);
+#define anv_measure_acquire(device) \
+ if (unlikely(device->physical->measure_device.config)) \
+ _anv_measure_acquire(device)
+
#define anv_measure_snapshot(cmd_buffer, type, event_name, count) \
if (unlikely(cmd_buffer->measure)) \
_anv_measure_snapshot(cmd_buffer, type, event_name, count)
diff --git a/src/intel/vulkan/anv_mesh_perprim_wa.c b/src/intel/vulkan/anv_mesh_perprim_wa.c
new file mode 100644
index 00000000000..f46d6a1082b
--- /dev/null
+++ b/src/intel/vulkan/anv_mesh_perprim_wa.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "nir_builder.h"
+
+/*
+ * Wa_18019110168 for gfx 12.5.
+ *
+ * This file implements workaround for HW bug, which leads to fragment shader
+ * reading incorrect per-primitive data if mesh shader, in addition to writing
+ * per-primitive data, also writes to gl_ClipDistance.
+ *
+ * The suggested solution to that bug is to not use per-primitive data by:
+ * - creating new vertices for provoking vertices shared by multiple primitives
+ * - converting per-primitive attributes read by fragment shader to flat
+ * per-vertex attributes for the provoking vertex
+ * - modifying fragment shader to read those per-vertex attributes
+ *
+ * There are at least 2 types of failures not handled very well:
+ * - if the number of varying slots overflows, then only some attributes will
+ * be converted, leading to corruption of those unconverted attributes
+ * - if the overall MUE size is so large it doesn't fit in URB, then URB
+ * allocation will fail in some way; unfortunately there's no good way to
+ *   tell how big the MUE will be at this moment and back out
+ *
+ * This workaround needs to be applied before linking, so that unused outputs
+ * created by this code are removed at link time.
+ *
+ * This workaround can be controlled by a driconf option to either disable it,
+ * lower its scope or force enable it.
+ *
+ * Option "anv_mesh_conv_prim_attrs_to_vert_attrs" is evaluated like this:
+ * value == 0 - disable workaround
+ * value < 0 - enable ONLY if workaround is required
+ * value > 0 - enable ALWAYS, even if it's not required
+ * abs(value) >= 1 - attribute conversion
+ * abs(value) >= 2 - attribute conversion and vertex duplication
+ *
+ * Default: -2 (both parts of the work around, ONLY if it's required)
+ *
+ */
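
A purely illustrative decode of the driconf value described above, following the rules listed in the comment; this is not the driver's actual option parsing:

static void
decode_wa_option(int value, bool wa_required,
                 bool *convert_attrs, bool *dup_vertices)
{
   /* 0 disables; negative enables only when required; positive forces. */
   const bool enable = value > 0 || (value < 0 && wa_required);

   *convert_attrs = enable;                                /* abs(value) >= 1 */
   *dup_vertices = enable && (value >= 2 || value <= -2);  /* abs(value) >= 2 */
}
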
+
+static bool
+anv_mesh_convert_attrs_prim_to_vert(struct nir_shader *nir,
+ gl_varying_slot *wa_mapping,
+ uint64_t fs_inputs,
+ const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ void *mem_ctx,
+ const bool dup_vertices,
+ const bool force_conversion)
+{
+ uint64_t per_primitive_outputs = nir->info.per_primitive_outputs;
+ per_primitive_outputs &= ~BITFIELD64_BIT(VARYING_SLOT_PRIMITIVE_INDICES);
+
+ if (per_primitive_outputs == 0)
+ return false;
+
+ uint64_t outputs_written = nir->info.outputs_written;
+ uint64_t other_outputs = outputs_written & ~per_primitive_outputs;
+
+ if ((other_outputs & (VARYING_BIT_CLIP_DIST0 | VARYING_BIT_CLIP_DIST1)) == 0)
+ if (!force_conversion)
+ return false;
+
+ uint64_t all_outputs = outputs_written;
+ unsigned attrs = 0;
+
+ uint64_t remapped_outputs = outputs_written & per_primitive_outputs;
+ remapped_outputs &= ~BITFIELD64_BIT(VARYING_SLOT_CULL_PRIMITIVE);
+
+ /* Skip locations not read by the fragment shader, because they will
+ * be eliminated at linking time. Note that some fs inputs may be
+ * removed only after optimizations, so it's possible that we will
+ * create too many variables.
+ */
+ remapped_outputs &= fs_inputs;
+
+ /* Figure out the mapping between per-primitive and new per-vertex outputs. */
+ nir_foreach_shader_out_variable(var, nir) {
+ int location = var->data.location;
+
+ if (!(BITFIELD64_BIT(location) & remapped_outputs))
+ continue;
+
+      /* Although primitive shading rate, layer and viewport have a predefined
+       * place in the MUE Primitive Header (so we can't really move them
+       * anywhere), we have to copy them to per-vertex space if the fragment
+       * shader reads them.
+       */
+ assert(location == VARYING_SLOT_PRIMITIVE_SHADING_RATE ||
+ location == VARYING_SLOT_LAYER ||
+ location == VARYING_SLOT_VIEWPORT ||
+ location == VARYING_SLOT_PRIMITIVE_ID ||
+ location >= VARYING_SLOT_VAR0);
+
+ const struct glsl_type *type = var->type;
+ if (nir_is_arrayed_io(var, MESA_SHADER_MESH) || var->data.per_view) {
+ assert(glsl_type_is_array(type));
+ type = glsl_get_array_element(type);
+ }
+
+ unsigned num_slots = glsl_count_attribute_slots(type, false);
+
+ for (gl_varying_slot slot = VARYING_SLOT_VAR0; slot <= VARYING_SLOT_VAR31; slot++) {
+ uint64_t mask = BITFIELD64_MASK(num_slots) << slot;
+ if ((all_outputs & mask) == 0) {
+ wa_mapping[location] = slot;
+ all_outputs |= mask;
+ attrs++;
+ break;
+ }
+ }
+
+ if (wa_mapping[location] == 0) {
+         fprintf(stderr, "Not enough space for hardware per-primitive data corruption workaround.\n");
+ break;
+ }
+ }
+
+ if (attrs == 0)
+ if (!force_conversion)
+ return false;
+
+ unsigned provoking_vertex = 0;
+
+ const VkPipelineRasterizationStateCreateInfo *rs_info = pCreateInfo->pRasterizationState;
+ const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
+ vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
+ if (rs_pv_info && rs_pv_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
+ provoking_vertex = 2;
+
+ unsigned vertices_per_primitive =
+ mesa_vertices_per_prim(nir->info.mesh.primitive_type);
+
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_builder b = nir_builder_at(nir_after_impl(impl));
+
+ /* wait for all subgroups to finish */
+ nir_barrier(&b, SCOPE_WORKGROUP);
+
+ nir_def *zero = nir_imm_int(&b, 0);
+
+ nir_def *local_invocation_index = nir_load_local_invocation_index(&b);
+
+ nir_def *cmp = nir_ieq(&b, local_invocation_index, zero);
+ nir_if *if_stmt = nir_push_if(&b, cmp);
+ {
+ nir_variable *primitive_count_var = NULL;
+ nir_variable *primitive_indices_var = NULL;
+
+ unsigned num_other_variables = 0;
+ nir_foreach_shader_out_variable(var, b.shader) {
+ if ((BITFIELD64_BIT(var->data.location) & other_outputs) == 0)
+ continue;
+ num_other_variables++;
+ }
+
+ nir_deref_instr **per_vertex_derefs =
+ ralloc_array(mem_ctx, nir_deref_instr *, num_other_variables);
+
+ unsigned num_per_vertex_variables = 0;
+
+ unsigned processed = 0;
+ nir_foreach_shader_out_variable(var, b.shader) {
+ if ((BITFIELD64_BIT(var->data.location) & other_outputs) == 0)
+ continue;
+
+ switch (var->data.location) {
+ case VARYING_SLOT_PRIMITIVE_COUNT:
+ primitive_count_var = var;
+ break;
+ case VARYING_SLOT_PRIMITIVE_INDICES:
+ primitive_indices_var = var;
+ break;
+ default: {
+ const struct glsl_type *type = var->type;
+ assert(glsl_type_is_array(type));
+ const struct glsl_type *array_element_type =
+ glsl_get_array_element(type);
+
+ if (dup_vertices) {
+               /*
+                * Resize the type of the array output to make space for one
+                * extra vertex attribute per primitive, so that the provoking
+                * vertex is never shared between primitives.
+                */
+ const struct glsl_type *new_type =
+ glsl_array_type(array_element_type,
+ glsl_get_length(type) +
+ nir->info.mesh.max_primitives_out,
+ 0);
+
+ var->type = new_type;
+ }
+
+ per_vertex_derefs[num_per_vertex_variables++] =
+ nir_build_deref_var(&b, var);
+ break;
+ }
+ }
+
+ ++processed;
+ }
+ assert(processed == num_other_variables);
+
+ assert(primitive_count_var != NULL);
+ assert(primitive_indices_var != NULL);
+
+ /* Update types of derefs to match type of variables they (de)reference. */
+ if (dup_vertices) {
+ nir_foreach_function_impl(impl, b.shader) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_deref)
+ continue;
+
+ nir_deref_instr *deref = nir_instr_as_deref(instr);
+ if (deref->deref_type != nir_deref_type_var)
+ continue;
+
+ if (deref->var->type != deref->type)
+ deref->type = deref->var->type;
+ }
+ }
+ }
+ }
+
+ /* indexed by slot of per-prim attribute */
+ struct {
+ nir_deref_instr *per_prim_deref;
+ nir_deref_instr *per_vert_deref;
+ } mapping[VARYING_SLOT_MAX] = {{NULL, NULL}, };
+
+ /* Create new per-vertex output variables mirroring per-primitive variables
+ * and create derefs for both old and new variables.
+ */
+ nir_foreach_shader_out_variable(var, b.shader) {
+ gl_varying_slot location = var->data.location;
+
+ if ((BITFIELD64_BIT(location) & (outputs_written & per_primitive_outputs)) == 0)
+ continue;
+ if (wa_mapping[location] == 0)
+ continue;
+
+ const struct glsl_type *type = var->type;
+ assert(glsl_type_is_array(type));
+ const struct glsl_type *array_element_type = glsl_get_array_element(type);
+
+ const struct glsl_type *new_type =
+ glsl_array_type(array_element_type,
+ nir->info.mesh.max_vertices_out +
+ (dup_vertices ? nir->info.mesh.max_primitives_out : 0),
+ 0);
+
+ nir_variable *new_var =
+ nir_variable_create(b.shader, nir_var_shader_out, new_type, var->name);
+ assert(wa_mapping[location] >= VARYING_SLOT_VAR0);
+ assert(wa_mapping[location] <= VARYING_SLOT_VAR31);
+ new_var->data.location = wa_mapping[location];
+ new_var->data.interpolation = INTERP_MODE_FLAT;
+
+ mapping[location].per_vert_deref = nir_build_deref_var(&b, new_var);
+ mapping[location].per_prim_deref = nir_build_deref_var(&b, var);
+ }
+
+ nir_def *trueconst = nir_imm_true(&b);
+
+ /*
+ * for each Primitive (0 : primitiveCount)
+ * if VertexUsed[PrimitiveIndices[Primitive][provoking vertex]]
+ * create 1 new vertex at offset "Vertex"
+ * copy per vert attributes of provoking vertex to the new one
+ * update PrimitiveIndices[Primitive][provoking vertex]
+ * Vertex++
+ * else
+ * VertexUsed[PrimitiveIndices[Primitive][provoking vertex]] := true
+ *
+ * for each attribute : mapping
+ * copy per_prim_attr(Primitive) to per_vert_attr[Primitive][provoking vertex]
+ */
+
+ /* primitive count */
+ nir_def *primitive_count = nir_load_var(&b, primitive_count_var);
+
+ /* primitive index */
+ nir_variable *primitive_var =
+ nir_local_variable_create(impl, glsl_uint_type(), "Primitive");
+ nir_deref_instr *primitive_deref = nir_build_deref_var(&b, primitive_var);
+ nir_store_deref(&b, primitive_deref, zero, 1);
+
+ /* vertex index */
+ nir_variable *vertex_var =
+ nir_local_variable_create(impl, glsl_uint_type(), "Vertex");
+ nir_deref_instr *vertex_deref = nir_build_deref_var(&b, vertex_var);
+ nir_store_deref(&b, vertex_deref, nir_imm_int(&b, nir->info.mesh.max_vertices_out), 1);
+
+ /* used vertices bitvector */
+ const struct glsl_type *used_vertex_type =
+ glsl_array_type(glsl_bool_type(),
+ nir->info.mesh.max_vertices_out,
+ 0);
+ nir_variable *used_vertex_var =
+ nir_local_variable_create(impl, used_vertex_type, "VertexUsed");
+ nir_deref_instr *used_vertex_deref =
+ nir_build_deref_var(&b, used_vertex_var);
+ /* Initialize it as "not used" */
+ for (unsigned i = 0; i < nir->info.mesh.max_vertices_out; ++i) {
+ nir_deref_instr *indexed_used_vertex_deref =
+ nir_build_deref_array(&b, used_vertex_deref, nir_imm_int(&b, i));
+ nir_store_deref(&b, indexed_used_vertex_deref, nir_imm_false(&b), 1);
+ }
+
+ nir_loop *loop = nir_push_loop(&b);
+ {
+ nir_def *primitive = nir_load_deref(&b, primitive_deref);
+ nir_def *cmp = nir_ige(&b, primitive, primitive_count);
+
+ nir_if *loop_check = nir_push_if(&b, cmp);
+ nir_jump(&b, nir_jump_break);
+ nir_pop_if(&b, loop_check);
+
+ nir_deref_instr *primitive_indices_deref =
+ nir_build_deref_var(&b, primitive_indices_var);
+ nir_deref_instr *indexed_primitive_indices_deref;
+ nir_def *src_vertex;
+ nir_def *prim_indices;
+
+      /* array of vectors; we have to extract the index out of the array deref */
+ indexed_primitive_indices_deref = nir_build_deref_array(&b, primitive_indices_deref, primitive);
+ prim_indices = nir_load_deref(&b, indexed_primitive_indices_deref);
+ src_vertex = nir_channel(&b, prim_indices, provoking_vertex);
+
+ nir_def *dst_vertex = nir_load_deref(&b, vertex_deref);
+
+ nir_deref_instr *indexed_used_vertex_deref =
+ nir_build_deref_array(&b, used_vertex_deref, src_vertex);
+ nir_def *used_vertex = nir_load_deref(&b, indexed_used_vertex_deref);
+ if (!dup_vertices)
+ used_vertex = nir_imm_false(&b);
+
+ nir_if *vertex_used_check = nir_push_if(&b, used_vertex);
+ {
+ for (unsigned a = 0; a < num_per_vertex_variables; ++a) {
+ nir_deref_instr *attr_arr = per_vertex_derefs[a];
+ nir_deref_instr *src = nir_build_deref_array(&b, attr_arr, src_vertex);
+ nir_deref_instr *dst = nir_build_deref_array(&b, attr_arr, dst_vertex);
+
+ nir_copy_deref(&b, dst, src);
+ }
+
+ /* replace one component of primitive indices vector */
+ nir_def *new_val =
+ nir_vector_insert_imm(&b, prim_indices, dst_vertex, provoking_vertex);
+
+ /* and store complete vector */
+ nir_store_deref(&b, indexed_primitive_indices_deref, new_val,
+ BITFIELD_MASK(vertices_per_primitive));
+
+ nir_store_deref(&b, vertex_deref, nir_iadd_imm(&b, dst_vertex, 1), 1);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(mapping); ++i) {
+ if (!mapping[i].per_vert_deref)
+ continue;
+
+ nir_deref_instr *src =
+ nir_build_deref_array(&b, mapping[i].per_prim_deref, primitive);
+ nir_deref_instr *dst =
+ nir_build_deref_array(&b, mapping[i].per_vert_deref, dst_vertex);
+
+ nir_copy_deref(&b, dst, src);
+ }
+ }
+ nir_push_else(&b, vertex_used_check);
+ {
+ nir_store_deref(&b, indexed_used_vertex_deref, trueconst, 1);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(mapping); ++i) {
+ if (!mapping[i].per_vert_deref)
+ continue;
+
+ nir_deref_instr *src =
+ nir_build_deref_array(&b, mapping[i].per_prim_deref, primitive);
+ nir_deref_instr *dst =
+ nir_build_deref_array(&b, mapping[i].per_vert_deref, src_vertex);
+
+ nir_copy_deref(&b, dst, src);
+ }
+
+ }
+ nir_pop_if(&b, vertex_used_check);
+
+ nir_store_deref(&b, primitive_deref, nir_iadd_imm(&b, primitive, 1), 1);
+ }
+ nir_pop_loop(&b, loop);
+ }
+ nir_pop_if(&b, if_stmt); /* local_invocation_index == 0 */
+
+ if (dup_vertices)
+ nir->info.mesh.max_vertices_out += nir->info.mesh.max_primitives_out;
+
+ if (should_print_nir(nir)) {
+ printf("%s\n", __func__);
+ nir_print_shader(nir, stdout);
+ }
+
+ /* deal with copy_derefs */
+ NIR_PASS(_, nir, nir_split_var_copies);
+ NIR_PASS(_, nir, nir_lower_var_copies);
+
+ nir_shader_gather_info(nir, impl);
+
+ return true;
+}
+
+static bool
+anv_frag_update_derefs_instr(struct nir_builder *b, nir_instr *instr, void *data)
+{
+ if (instr->type != nir_instr_type_deref)
+ return false;
+
+ nir_deref_instr *deref = nir_instr_as_deref(instr);
+ if (deref->deref_type != nir_deref_type_var)
+ return false;
+
+ nir_variable *var = deref->var;
+ if (!(var->data.mode & nir_var_shader_in))
+ return false;
+
+ int location = var->data.location;
+ nir_deref_instr **new_derefs = (nir_deref_instr **)data;
+ if (new_derefs[location] == NULL)
+ return false;
+
+ nir_instr_remove(&deref->instr);
+ nir_def_rewrite_uses(&deref->def, &new_derefs[location]->def);
+
+ return true;
+}
+
+static bool
+anv_frag_update_derefs(nir_shader *shader, nir_deref_instr **mapping)
+{
+ return nir_shader_instructions_pass(shader, anv_frag_update_derefs_instr,
+ nir_metadata_none, (void *)mapping);
+}
+
+/* Update fragment shader inputs with new ones. */
+static void
+anv_frag_convert_attrs_prim_to_vert(struct nir_shader *nir,
+ gl_varying_slot *wa_mapping)
+{
+ /* indexed by slot of per-prim attribute */
+ nir_deref_instr *new_derefs[VARYING_SLOT_MAX] = {NULL, };
+
+ nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+ nir_builder b = nir_builder_at(nir_before_impl(impl));
+
+ nir_foreach_shader_in_variable_safe(var, nir) {
+ gl_varying_slot location = var->data.location;
+ gl_varying_slot new_location = wa_mapping[location];
+ if (new_location == 0)
+ continue;
+
+ assert(wa_mapping[new_location] == 0);
+
+ nir_variable *new_var =
+ nir_variable_create(b.shader, nir_var_shader_in, var->type, var->name);
+ new_var->data.location = new_location;
+ new_var->data.location_frac = var->data.location_frac;
+ new_var->data.interpolation = INTERP_MODE_FLAT;
+
+ new_derefs[location] = nir_build_deref_var(&b, new_var);
+ }
+
+ NIR_PASS(_, nir, anv_frag_update_derefs, new_derefs);
+
+ nir_shader_gather_info(nir, impl);
+}
+
+void
+anv_apply_per_prim_attr_wa(struct nir_shader *ms_nir,
+ struct nir_shader *fs_nir,
+ struct anv_device *device,
+ const VkGraphicsPipelineCreateInfo *info)
+{
+ const struct intel_device_info *devinfo = device->info;
+
+ int mesh_conv_prim_attrs_to_vert_attrs =
+ device->physical->instance->mesh_conv_prim_attrs_to_vert_attrs;
+ if (mesh_conv_prim_attrs_to_vert_attrs < 0 &&
+ !intel_needs_workaround(devinfo, 18019110168))
+ mesh_conv_prim_attrs_to_vert_attrs = 0;
+
+ if (mesh_conv_prim_attrs_to_vert_attrs != 0) {
+ uint64_t fs_inputs = 0;
+ nir_foreach_shader_in_variable(var, fs_nir)
+ fs_inputs |= BITFIELD64_BIT(var->data.location);
+
+ void *stage_ctx = ralloc_context(NULL);
+
+ gl_varying_slot wa_mapping[VARYING_SLOT_MAX] = { 0, };
+
+ const bool dup_vertices = abs(mesh_conv_prim_attrs_to_vert_attrs) >= 2;
+ const bool force_conversion = mesh_conv_prim_attrs_to_vert_attrs > 0;
+
+ if (anv_mesh_convert_attrs_prim_to_vert(ms_nir, wa_mapping,
+ fs_inputs, info, stage_ctx,
+ dup_vertices, force_conversion))
+ anv_frag_convert_attrs_prim_to_vert(fs_nir, wa_mapping);
+
+ ralloc_free(stage_ctx);
+ }
+}
diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h
index 0ffed5dfc0f..435b9065979 100644
--- a/src/intel/vulkan/anv_nir.h
+++ b/src/intel/vulkan/anv_nir.h
@@ -31,63 +31,94 @@
extern "C" {
#endif
-bool anv_check_for_primitive_replication(nir_shader **shaders,
- struct anv_graphics_pipeline *pipeline);
+/* This map represents a mapping where the key is the NIR
+ * nir_intrinsic_resource_intel::block index. It allows mapping bindless UBO
+ * accesses to their descriptor entry.
+ *
+ * This map only lives temporarily between the anv_nir_apply_pipeline_layout()
+ * and anv_nir_compute_push_layout() passes.
+ */
+struct anv_pipeline_push_map {
+ uint32_t block_count;
+ struct anv_pipeline_binding *block_to_descriptor;
+};
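+
+/* A minimal usage sketch (assumed consumer code, not declared here): a later
+ * pass resolves the block index recorded on a resource intrinsic back to its
+ * binding:
+ *
+ *    assert(block < push_map->block_count);
+ *    const struct anv_pipeline_binding *binding =
+ *       &push_map->block_to_descriptor[block];
+ */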
+
+bool anv_check_for_primitive_replication(struct anv_device *device,
+ VkShaderStageFlags stages,
+ nir_shader **shaders,
+ uint32_t view_mask);
-bool anv_nir_lower_multiview(nir_shader *shader,
- struct anv_graphics_pipeline *pipeline);
+bool anv_nir_lower_load_patch_vertices_in(nir_shader *shader);
+
+bool anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask,
+ bool use_primitive_replication);
bool anv_nir_lower_ycbcr_textures(nir_shader *shader,
- const struct anv_pipeline_layout *layout);
+ const struct anv_pipeline_sets_layout *layout);
static inline nir_address_format
anv_nir_ssbo_addr_format(const struct anv_physical_device *pdevice,
- bool robust_buffer_access)
+ enum brw_robustness_flags robust_flags)
{
- if (pdevice->has_a64_buffer_access) {
- if (robust_buffer_access)
- return nir_address_format_64bit_bounded_global;
- else
- return nir_address_format_64bit_global_32bit_offset;
- } else {
- return nir_address_format_32bit_index_offset;
- }
+ if (robust_flags & BRW_ROBUSTNESS_SSBO)
+ return nir_address_format_64bit_bounded_global;
+ else
+ return nir_address_format_64bit_global_32bit_offset;
}
static inline nir_address_format
anv_nir_ubo_addr_format(const struct anv_physical_device *pdevice,
- bool robust_buffer_access)
+ enum brw_robustness_flags robust_flags)
{
- if (pdevice->has_a64_buffer_access) {
- if (robust_buffer_access)
- return nir_address_format_64bit_bounded_global;
- else
- return nir_address_format_64bit_global_32bit_offset;
- } else {
- return nir_address_format_32bit_index_offset;
- }
+ if (robust_flags & BRW_ROBUSTNESS_UBO)
+ return nir_address_format_64bit_bounded_global;
+ else
+ return nir_address_format_64bit_global_32bit_offset;
}
bool anv_nir_lower_ubo_loads(nir_shader *shader);
-void anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
- bool robust_buffer_access,
- const struct anv_pipeline_layout *layout,
- nir_shader *shader,
- struct anv_pipeline_bind_map *map);
+void anv_nir_apply_pipeline_layout(nir_shader *shader,
+ const struct anv_physical_device *pdevice,
+ enum brw_robustness_flags robust_flags,
+ bool independent_sets,
+ const struct anv_pipeline_sets_layout *layout,
+ struct anv_pipeline_bind_map *map,
+ struct anv_pipeline_push_map *push_map,
+ void *push_map_mem_ctx);
-void anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
- bool robust_buffer_access,
- nir_shader *nir,
+void anv_nir_compute_push_layout(nir_shader *nir,
+ const struct anv_physical_device *pdevice,
+ enum brw_robustness_flags robust_flags,
+ bool fragment_dynamic,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map,
+ const struct anv_pipeline_push_map *push_map,
+ enum anv_descriptor_set_layout_type desc_type,
void *mem_ctx);
void anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map);
+bool anv_nir_update_resource_intel_block(nir_shader *shader);
+
+bool anv_nir_lower_resource_intel(nir_shader *shader,
+ const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type desc_type);
+
bool anv_nir_add_base_work_group_id(nir_shader *shader);
+uint32_t anv_nir_compute_used_push_descriptors(nir_shader *shader,
+ const struct anv_pipeline_sets_layout *layout);
+
+bool anv_nir_loads_push_desc_buffer(nir_shader *nir,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_pipeline_bind_map *bind_map);
+
+uint32_t anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_pipeline_bind_map *bind_map);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/intel/vulkan/anv_nir_add_base_work_group_id.c b/src/intel/vulkan/anv_nir_add_base_work_group_id.c
deleted file mode 100644
index 97596214de9..00000000000
--- a/src/intel/vulkan/anv_nir_add_base_work_group_id.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright © 2017 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_nir.h"
-#include "nir/nir_builder.h"
-#include "compiler/brw_compiler.h"
-
-bool
-anv_nir_add_base_work_group_id(nir_shader *shader)
-{
- assert(shader->info.stage == MESA_SHADER_COMPUTE);
-
- nir_builder b;
- bool progress = false;
- nir_foreach_function(function, shader) {
- if (!function->impl)
- continue;
-
- nir_builder_init(&b, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *load_id = nir_instr_as_intrinsic(instr);
- if (load_id->intrinsic != nir_intrinsic_load_workgroup_id)
- continue;
-
- b.cursor = nir_after_instr(&load_id->instr);
-
- nir_ssa_def *load_base =
- nir_load_push_constant(&b, 3, 32, nir_imm_int(&b, 0),
- .base = offsetof(struct anv_push_constants, cs.base_work_group_id),
- .range = 3 * sizeof(uint32_t));
-
- nir_ssa_def *id = nir_iadd(&b, &load_id->dest.ssa,
- load_base);
-
- nir_ssa_def_rewrite_uses_after(&load_id->dest.ssa,
- id,
- id->parent_instr);
- progress = true;
- }
- }
-
- nir_metadata_preserve(function->impl, nir_metadata_block_index |
- nir_metadata_dominance);
- }
-
- return progress;
-}
diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index 0f508490110..19183a85949 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -22,23 +22,33 @@
*/
#include "anv_nir.h"
-#include "program/prog_parameter.h"
#include "nir/nir_builder.h"
#include "compiler/brw_nir.h"
#include "util/mesa-sha1.h"
#include "util/set.h"
+#include "vk_enum_to_str.h"
+
+#include "genxml/genX_bits.h"
+
/* Sampler tables don't actually have a maximum size but we pick one just so
* that we don't end up emitting too much state on-the-fly.
*/
#define MAX_SAMPLER_TABLE_SIZE 128
#define BINDLESS_OFFSET 255
+#define sizeof_field(type, field) sizeof(((type *)0)->field)
+
+enum binding_property {
+ BINDING_PROPERTY_NORMAL = BITFIELD_BIT(0),
+ BINDING_PROPERTY_PUSHABLE = BITFIELD_BIT(1),
+ BINDING_PROPERTY_EMBEDDED_SAMPLER = BITFIELD_BIT(2),
+};
+
struct apply_pipeline_layout_state {
const struct anv_physical_device *pdevice;
- const struct anv_pipeline_layout *layout;
- bool add_bounds_checks;
+ const struct anv_pipeline_sets_layout *layout;
nir_address_format desc_addr_format;
nir_address_format ssbo_addr_format;
nir_address_format ubo_addr_format;
@@ -48,17 +58,50 @@ struct apply_pipeline_layout_state {
bool uses_constants;
bool has_dynamic_buffers;
+ bool has_independent_sets;
uint8_t constants_offset;
struct {
bool desc_buffer_used;
uint8_t desc_offset;
- uint8_t *use_count;
- uint8_t *surface_offsets;
- uint8_t *sampler_offsets;
+ struct {
+ uint8_t use_count;
+
+ /* Binding table offset */
+ uint8_t surface_offset;
+
+ /* Sampler table offset */
+ uint8_t sampler_offset;
+
+ /* Embedded sampler index */
+ uint16_t embedded_sampler_index;
+
+ /* Properties of the binding */
+ enum binding_property properties;
+
+      /* Each binding is identified with a unique identifier for push
+       * computation.
+       */
+ uint32_t push_block;
+ } *binding;
} set[MAX_SETS];
};
+/* For a given binding, tells us how many binding table entries are needed per
+ * element.
+ */
+static uint32_t
+bti_multiplier(const struct apply_pipeline_layout_state *state,
+ uint32_t set, uint32_t binding)
+{
+ const struct anv_descriptor_set_layout *set_layout =
+ state->layout->set[set].layout;
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
+
+ return bind_layout->max_plane_count == 0 ? 1 : bind_layout->max_plane_count;
+}
+
static nir_address_format
addr_format_for_desc_type(VkDescriptorType desc_type,
struct apply_pipeline_layout_state *state)
@@ -72,7 +115,7 @@ addr_format_for_desc_type(VkDescriptorType desc_type,
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
return state->ubo_addr_format;
- case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+ case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
return state->desc_addr_format;
default:
@@ -84,18 +127,68 @@ static void
add_binding(struct apply_pipeline_layout_state *state,
uint32_t set, uint32_t binding)
{
+ const struct anv_descriptor_set_layout *set_layout =
+ state->layout->set[set].layout;
const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
+ &set_layout->binding[binding];
- if (state->set[set].use_count[binding] < UINT8_MAX)
- state->set[set].use_count[binding]++;
+ assert(set < state->layout->num_sets);
+ assert(binding < state->layout->set[set].layout->binding_count);
+
+ if (state->set[set].binding[binding].use_count < UINT8_MAX)
+ state->set[set].binding[binding].use_count++;
/* Only flag the descriptor buffer as used if there's actually data for
* this binding. This lets us be lazy and call this function constantly
* without worrying about unnecessarily enabling the buffer.
*/
- if (anv_descriptor_size(bind_layout))
+ if (bind_layout->descriptor_surface_stride)
state->set[set].desc_buffer_used = true;
+
+ if (bind_layout->dynamic_offset_index >= 0)
+ state->has_dynamic_buffers = true;
+
+ state->set[set].binding[binding].properties |= BINDING_PROPERTY_NORMAL;
+
+ if (set_layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT)
+ state->set[set].binding[binding].properties |= BINDING_PROPERTY_EMBEDDED_SAMPLER;
+}
+
+const VkDescriptorSetLayoutCreateFlags non_pushable_set_flags =
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT;
+
+const VkDescriptorBindingFlags non_pushable_binding_flags =
+ VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT |
+ VK_DESCRIPTOR_BINDING_UPDATE_UNUSED_WHILE_PENDING_BIT |
+ VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT;
+
+static void
+add_binding_type(struct apply_pipeline_layout_state *state,
+ uint32_t set, uint32_t binding, VkDescriptorType type)
+{
+ add_binding(state, set, binding);
+
+ const struct anv_descriptor_set_layout *set_layout =
+ state->layout->set[set].layout;
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
+
+   /* We can't push from descriptor buffers, but we can for push descriptors */
+ const bool is_set_pushable =
+ (set_layout->flags & non_pushable_set_flags) == 0 ||
+ set_layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
+ const bool is_binding_pushable =
+ (bind_layout->flags & non_pushable_binding_flags) == 0;
+
+ if (is_set_pushable && is_binding_pushable &&
+ (state->layout->set[set].layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ state->layout->set[set].layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ state->layout->set[set].layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ||
+ state->layout->set[set].layout->binding[binding].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) &&
+ (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER ||
+ type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK))
+ state->set[set].binding[binding].properties |= BINDING_PROPERTY_PUSHABLE;
}
static void
@@ -127,28 +220,22 @@ get_used_bindings(UNUSED nir_builder *_b, nir_instr *instr, void *_state)
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_vulkan_resource_index:
- add_binding(state, nir_intrinsic_desc_set(intrin),
- nir_intrinsic_binding(intrin));
+ add_binding_type(state,
+ nir_intrinsic_desc_set(intrin),
+ nir_intrinsic_binding(intrin),
+ nir_intrinsic_desc_type(intrin));
break;
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_store:
- case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_imin:
- case nir_intrinsic_image_deref_atomic_umin:
- case nir_intrinsic_image_deref_atomic_imax:
- case nir_intrinsic_image_deref_atomic_umax:
- case nir_intrinsic_image_deref_atomic_and:
- case nir_intrinsic_image_deref_atomic_or:
- case nir_intrinsic_image_deref_atomic_xor:
- case nir_intrinsic_image_deref_atomic_exchange:
- case nir_intrinsic_image_deref_atomic_comp_swap:
- case nir_intrinsic_image_deref_atomic_fadd:
+ case nir_intrinsic_image_deref_atomic:
+ case nir_intrinsic_image_deref_atomic_swap:
case nir_intrinsic_image_deref_size:
case nir_intrinsic_image_deref_samples:
case nir_intrinsic_image_deref_load_param_intel:
case nir_intrinsic_image_deref_load_raw_intel:
case nir_intrinsic_image_deref_store_raw_intel:
+ case nir_intrinsic_image_deref_sparse_load:
add_deref_src_binding(state, intrin->src[0]);
break;
@@ -200,11 +287,14 @@ descriptor_has_bti(nir_intrinsic_instr *intrin,
const struct anv_descriptor_set_binding_layout *bind_layout =
&state->layout->set[set].layout->binding[binding];
+ if (state->set[set].binding[binding].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER)
+ return false;
+
uint32_t surface_index;
if (bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM)
surface_index = state->set[set].desc_offset;
else
- surface_index = state->set[set].surface_offsets[binding];
+ surface_index = state->set[set].binding[binding].surface_offset;
/* Only lower to a BTI message if we have a valid binding table index. */
return surface_index < MAX_BINDING_TABLE_SIZE;
@@ -216,12 +306,7 @@ descriptor_address_format(nir_intrinsic_instr *intrin,
{
assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
- uint32_t set = nir_intrinsic_desc_set(intrin);
- uint32_t binding = nir_intrinsic_binding(intrin);
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
-
- return addr_format_for_desc_type(bind_layout->type, state);
+ return addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state);
}
static nir_intrinsic_instr *
@@ -242,23 +327,23 @@ nir_deref_find_descriptor(nir_deref_instr *deref,
nir_intrinsic_instr *intrin = nir_src_as_intrinsic(deref->parent);
if (!intrin || intrin->intrinsic != nir_intrinsic_load_vulkan_descriptor)
- return false;
+ return NULL;
return find_descriptor_for_index_src(intrin->src[0], state);
}
-static nir_ssa_def *
+static nir_def *
build_load_descriptor_mem(nir_builder *b,
- nir_ssa_def *desc_addr, unsigned desc_offset,
+ nir_def *desc_addr, unsigned desc_offset,
unsigned num_components, unsigned bit_size,
- struct apply_pipeline_layout_state *state)
+ const struct apply_pipeline_layout_state *state)
{
switch (state->desc_addr_format) {
case nir_address_format_64bit_global_32bit_offset: {
- nir_ssa_def *base_addr =
- nir_pack_64_2x32(b, nir_channels(b, desc_addr, 0x3));
- nir_ssa_def *offset32 =
+ nir_def *base_addr =
+ nir_pack_64_2x32(b, nir_trim_vector(b, desc_addr, 2));
+ nir_def *offset32 =
nir_iadd_imm(b, nir_channel(b, desc_addr, 3), desc_offset);
return nir_load_global_constant_offset(b, num_components, bit_size,
@@ -268,8 +353,8 @@ build_load_descriptor_mem(nir_builder *b,
}
case nir_address_format_32bit_index_offset: {
- nir_ssa_def *surface_index = nir_channel(b, desc_addr, 0);
- nir_ssa_def *offset32 =
+ nir_def *surface_index = nir_channel(b, desc_addr, 0);
+ nir_def *offset32 =
nir_iadd_imm(b, nir_channel(b, desc_addr, 1), desc_offset);
return nir_load_ubo(b, num_components, bit_size,
@@ -277,7 +362,7 @@ build_load_descriptor_mem(nir_builder *b,
.align_mul = 8,
.align_offset = desc_offset % 8,
.range_base = 0,
- .range = ~0);
+ .range = num_components * bit_size / 8);
}
default:
@@ -285,6 +370,183 @@ build_load_descriptor_mem(nir_builder *b,
}
}
+/* When using direct descriptors, we do not have a structure to read in memory
+ * like anv_address_range_descriptor where all the fields perfectly match the
+ * vec4 address format we need to generate for A64 messages. Instead we need
+ * to build the vec4 by parsing the RENDER_SURFACE_STATE structure. That is
+ * easy enough for the surface address, but a lot less fun for the size, where
+ * you have to combine 3 fields scattered over multiple dwords, add one to the
+ * total and check the surface type to deal with null descriptors.
+ *
+ * Fortunately we can reuse the Auxiliary Surface Address field to stash our
+ * buffer size and just load a vec4.
+ */
+static nir_def *
+build_optimized_load_render_surface_state_address(nir_builder *b,
+ nir_def *desc_addr,
+ struct apply_pipeline_layout_state *state)
+
+{
+ const struct intel_device_info *devinfo = &state->pdevice->info;
+
+ nir_def *surface_addr =
+ build_load_descriptor_mem(b, desc_addr,
+ RENDER_SURFACE_STATE_SurfaceBaseAddress_start(devinfo) / 8,
+ 4, 32, state);
+ nir_def *addr_ldw = nir_channel(b, surface_addr, 0);
+ nir_def *addr_udw = nir_channel(b, surface_addr, 1);
+ nir_def *length = nir_channel(b, surface_addr, 3);
+
+ return nir_vec4(b, addr_ldw, addr_udw, length, nir_imm_int(b, 0));
+}
+
+/* When using direct descriptors, we do not have a structure to read in memory
+ * like anv_address_range_descriptor where all the fields perfectly match the
+ * vec4 address format we need to generate for A64 messages. Instead we need
+ * to build the vec4 by parsing the RENDER_SURFACE_STATE structure. That is
+ * easy enough for the surface address, but a lot less fun for the size.
+ */
+static nir_def *
+build_non_optimized_load_render_surface_state_address(nir_builder *b,
+ nir_def *desc_addr,
+ struct apply_pipeline_layout_state *state)
+
+{
+ const struct intel_device_info *devinfo = &state->pdevice->info;
+
+ assert(((RENDER_SURFACE_STATE_SurfaceBaseAddress_start(devinfo) +
+ RENDER_SURFACE_STATE_SurfaceBaseAddress_bits(devinfo) - 1) -
+ RENDER_SURFACE_STATE_Width_start(devinfo)) / 8 <= 32);
+
+ nir_def *surface_addr =
+ build_load_descriptor_mem(b, desc_addr,
+ RENDER_SURFACE_STATE_SurfaceBaseAddress_start(devinfo) / 8,
+ DIV_ROUND_UP(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits(devinfo), 32),
+ 32, state);
+ nir_def *addr_ldw = nir_channel(b, surface_addr, 0);
+ nir_def *addr_udw = nir_channel(b, surface_addr, 1);
+
+ /* Take all the RENDER_SURFACE_STATE fields from the beginning of the
+ * structure up to the Depth field.
+ */
+ const uint32_t type_sizes_dwords =
+ DIV_ROUND_UP(RENDER_SURFACE_STATE_Depth_start(devinfo) +
+ RENDER_SURFACE_STATE_Depth_bits(devinfo), 32);
+ nir_def *type_sizes =
+ build_load_descriptor_mem(b, desc_addr, 0, type_sizes_dwords, 32, state);
+
+ const unsigned width_start = RENDER_SURFACE_STATE_Width_start(devinfo);
+ /* SKL PRMs, Volume 2d: Command Reference: Structures, RENDER_SURFACE_STATE
+ *
+ * Width: "bits [6:0] of the number of entries in the buffer - 1"
+ * Height: "bits [20:7] of the number of entries in the buffer - 1"
+ * Depth: "bits [31:21] of the number of entries in the buffer - 1"
+ */
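+   /*
+    * Worked example (illustrative values): a buffer with 1000 entries stores
+    * the value 999 split across the three fields:
+    *
+    *    Width  = 999 & 0x7f          = 103  (bits [6:0])
+    *    Height = (999 >> 7) & 0x3fff = 7    (bits [20:7])
+    *    Depth  = (999 >> 21) & 0x7ff = 0    (bits [31:21])
+    *
+    * The code below reassembles 103 | (7 << 7) | (0 << 21) = 999 and adds 1.
+    */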
+ const unsigned width_bits = 7;
+ nir_def *width =
+ nir_iand_imm(b,
+ nir_ishr_imm(b,
+ nir_channel(b, type_sizes, width_start / 32),
+ width_start % 32),
+ (1u << width_bits) - 1);
+
+ const unsigned height_start = RENDER_SURFACE_STATE_Height_start(devinfo);
+ const unsigned height_bits = RENDER_SURFACE_STATE_Height_bits(devinfo);
+ nir_def *height =
+ nir_iand_imm(b,
+ nir_ishr_imm(b,
+ nir_channel(b, type_sizes, height_start / 32),
+ height_start % 32),
+ (1u << height_bits) - 1);
+
+ const unsigned depth_start = RENDER_SURFACE_STATE_Depth_start(devinfo);
+ const unsigned depth_bits = RENDER_SURFACE_STATE_Depth_bits(devinfo);
+ nir_def *depth =
+ nir_iand_imm(b,
+ nir_ishr_imm(b,
+ nir_channel(b, type_sizes, depth_start / 32),
+ depth_start % 32),
+ (1u << depth_bits) - 1);
+
+ nir_def *length = width;
+ length = nir_ior(b, length, nir_ishl_imm(b, height, width_bits));
+ length = nir_ior(b, length, nir_ishl_imm(b, depth, width_bits + height_bits));
+ length = nir_iadd_imm(b, length, 1);
+
+   /* Check the surface type; if it's SURFTYPE_NULL, set the length of the
+    * buffer to 0.
+    */
+ const unsigned type_start = RENDER_SURFACE_STATE_SurfaceType_start(devinfo);
+ const unsigned type_dw = type_start / 32;
+ nir_def *type =
+ nir_iand_imm(b,
+ nir_ishr_imm(b,
+ nir_channel(b, type_sizes, type_dw),
+ type_start % 32),
+ (1u << RENDER_SURFACE_STATE_SurfaceType_bits(devinfo)) - 1);
+
+ length = nir_bcsel(b,
+ nir_ieq_imm(b, type, 7 /* SURFTYPE_NULL */),
+ nir_imm_int(b, 0), length);
+
+ return nir_vec4(b, addr_ldw, addr_udw, length, nir_imm_int(b, 0));
+}
+
+static inline nir_def *
+build_load_render_surface_state_address(nir_builder *b,
+ nir_def *desc_addr,
+ struct apply_pipeline_layout_state *state)
+{
+ if (state->pdevice->isl_dev.buffer_length_in_aux_addr)
+ return build_optimized_load_render_surface_state_address(b, desc_addr, state);
+ return build_non_optimized_load_render_surface_state_address(b, desc_addr, state);
+}
+
+/* Load the depth of a 3D storage image.
+ *
+ * Either by reading the indirect descriptor value, or reading the value from
+ * RENDER_SURFACE_STATE.
+ *
+ * This is necessary for VK_EXT_image_sliced_view_of_3d.
+ */
+static nir_def *
+build_load_storage_3d_image_depth(nir_builder *b,
+ nir_def *desc_addr,
+ nir_def *resinfo_depth,
+ struct apply_pipeline_layout_state *state)
+
+{
+ const struct intel_device_info *devinfo = &state->pdevice->info;
+
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ return build_load_descriptor_mem(
+ b, desc_addr,
+ offsetof(struct anv_storage_image_descriptor, image_depth),
+ 1, 32, state);
+ } else {
+ nir_def *data = build_load_descriptor_mem(
+ b, desc_addr,
+ RENDER_SURFACE_STATE_RenderTargetViewExtent_start(devinfo) / 8,
+ 1, 32, state);
+ nir_def *depth =
+ nir_ushr_imm(
+ b, data,
+ RENDER_SURFACE_STATE_RenderTargetViewExtent_start(devinfo) % 32);
+ depth = nir_iand_imm(
+ b, depth,
+ (1u << RENDER_SURFACE_STATE_RenderTargetViewExtent_bits(devinfo)) - 1);
+ depth = nir_iadd_imm(b, depth, 1);
+
+ /* Return the minimum between the RESINFO value and the
+ * RENDER_SURFACE_STATE::RenderTargetViewExtent value.
+ *
+ * Both are expressed for the current view LOD, but in the case of a
+ * SURFTYPE_NULL, RESINFO will return the right value, while the -1
+ * value in RENDER_SURFACE_STATE should be ignored.
+ */
+ return nir_umin(b, resinfo_depth, depth);
+ }
+}
/** Build a Vulkan resource index
*
* A "resource index" is the term used by our SPIR-V parser and the relevant
@@ -305,9 +567,10 @@ build_load_descriptor_mem(nir_builder *b,
* The load_vulkan_descriptor intrinsic exists to provide a transition point
* between these two forms of derefs: descriptor and memory.
*/
-static nir_ssa_def *
-build_res_index(nir_builder *b, uint32_t set, uint32_t binding,
- nir_ssa_def *array_index, nir_address_format addr_format,
+static nir_def *
+build_res_index(nir_builder *b,
+ uint32_t set, uint32_t binding,
+ nir_def *array_index,
struct apply_pipeline_layout_state *state)
{
const struct anv_descriptor_set_binding_layout *bind_layout =
@@ -315,75 +578,94 @@ build_res_index(nir_builder *b, uint32_t set, uint32_t binding,
uint32_t array_size = bind_layout->array_size;
- switch (addr_format) {
+ uint32_t set_idx;
+ switch (state->desc_addr_format) {
case nir_address_format_64bit_global_32bit_offset:
- case nir_address_format_64bit_bounded_global: {
- uint32_t set_idx;
- switch (state->desc_addr_format) {
- case nir_address_format_64bit_global_32bit_offset:
- set_idx = set;
- break;
-
- case nir_address_format_32bit_index_offset:
- assert(state->set[set].desc_offset < MAX_BINDING_TABLE_SIZE);
- set_idx = state->set[set].desc_offset;
- break;
-
- default:
- unreachable("Unsupported address format");
- }
-
- assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS);
- uint32_t dynamic_offset_index = 0xff; /* No dynamic offset */
- if (bind_layout->dynamic_offset_index >= 0) {
- dynamic_offset_index =
- state->layout->set[set].dynamic_offset_start +
- bind_layout->dynamic_offset_index;
- }
+      /* Descriptor set buffer accesses will go through A64 messages, so the
+       * index to get the descriptor set buffer address is located in
+       * anv_push_constants::desc_surface_offsets and is indexed by the set
+       * number.
+       */
+ set_idx = set;
+ break;
- const uint32_t packed = (set_idx << 16) | dynamic_offset_index;
+ case nir_address_format_32bit_index_offset:
+ /* Descriptor set buffer accesses will go through the binding table. The
+ * offset is the entry in the binding table.
+ */
+ assert(state->set[set].desc_offset < MAX_BINDING_TABLE_SIZE);
+ set_idx = state->set[set].desc_offset;
+ break;
- return nir_vec4(b, nir_imm_int(b, packed),
- nir_imm_int(b, bind_layout->descriptor_offset),
- nir_imm_int(b, array_size - 1),
- array_index);
+ default:
+ unreachable("Unsupported address format");
}
- case nir_address_format_32bit_index_offset: {
- assert(state->desc_addr_format == nir_address_format_32bit_index_offset);
- if (bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
- uint32_t surface_index = state->set[set].desc_offset;
- return nir_imm_ivec2(b, surface_index,
- bind_layout->descriptor_offset);
+ assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS);
+ nir_def *dynamic_offset_index;
+ if (bind_layout->dynamic_offset_index >= 0) {
+ if (state->has_independent_sets) {
+ nir_def *dynamic_offset_start =
+ nir_load_desc_set_dynamic_index_intel(b, nir_imm_int(b, set));
+ dynamic_offset_index =
+ nir_iadd_imm(b, dynamic_offset_start,
+ bind_layout->dynamic_offset_index);
+ } else {
+ dynamic_offset_index =
+ nir_imm_int(b,
+ state->layout->set[set].dynamic_offset_start +
+ bind_layout->dynamic_offset_index);
+ }
} else {
- uint32_t surface_index = state->set[set].surface_offsets[binding];
- assert(array_size > 0 && array_size <= UINT16_MAX);
- assert(surface_index <= UINT16_MAX);
- uint32_t packed = ((array_size - 1) << 16) | surface_index;
- return nir_vec2(b, array_index, nir_imm_int(b, packed));
+ dynamic_offset_index = nir_imm_int(b, 0xff); /* No dynamic offset */
}
- }
- default:
- unreachable("Unsupported address format");
- }
+ const uint32_t desc_bti = state->set[set].binding[binding].surface_offset;
+   /* We don't care about the stride field for inline uniforms (see
+    * build_desc_addr_for_res_index), but for anything else the stride should
+    * be aligned to 8 bytes because we store it divided by 8 in the packed
+    * info, which lets us encode a stride of up to 2040 (8 * 255).
+    */
+ assert(bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ||
+ bind_layout->descriptor_surface_stride % 8 == 0);
+ const uint32_t desc_stride =
+ bind_layout->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ? 0 :
+ bind_layout->descriptor_surface_stride / 8;
+
+ nir_def *packed =
+ nir_ior_imm(b,
+ dynamic_offset_index,
+ (desc_stride << 24) |
+ (desc_bti << 16) |
+ (set_idx << 8));
+
+ return nir_vec4(b, packed,
+ nir_imm_int(b, bind_layout->descriptor_surface_offset),
+ nir_imm_int(b, array_size - 1),
+ array_index);
}
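+
+/* Layout of the packed dword built above and decoded by unpack_res_index()
+ * below (a summary of the code, not a separately documented format):
+ *
+ *    bits  [7:0] : dynamic offset base index (0xff means no dynamic offset)
+ *    bits [15:8] : descriptor set index (or descriptor buffer BTI entry)
+ *    bits [23:16]: binding table surface offset of the binding
+ *    bits [31:24]: descriptor surface stride / 8
+ */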
struct res_index_defs {
- nir_ssa_def *set_idx;
- nir_ssa_def *dyn_offset_base;
- nir_ssa_def *desc_offset_base;
- nir_ssa_def *array_index;
+ nir_def *bti_idx;
+ nir_def *set_idx;
+ nir_def *dyn_offset_base;
+ nir_def *desc_offset_base;
+ nir_def *array_index;
+ nir_def *desc_stride;
};
static struct res_index_defs
-unpack_res_index(nir_builder *b, nir_ssa_def *index)
+unpack_res_index(nir_builder *b, nir_def *index)
{
struct res_index_defs defs;
- nir_ssa_def *packed = nir_channel(b, index, 0);
- defs.set_idx = nir_extract_u16(b, packed, nir_imm_int(b, 1));
- defs.dyn_offset_base = nir_extract_u16(b, packed, nir_imm_int(b, 0));
+ nir_def *packed = nir_channel(b, index, 0);
+ defs.desc_stride =
+ nir_imul_imm(b, nir_extract_u8(b, packed, nir_imm_int(b, 3)), 8);
+ defs.bti_idx = nir_extract_u8(b, packed, nir_imm_int(b, 2));
+ defs.set_idx = nir_extract_u8(b, packed, nir_imm_int(b, 1));
+ defs.dyn_offset_base = nir_extract_u8(b, packed, nir_imm_int(b, 0));
defs.desc_offset_base = nir_channel(b, index, 1);
defs.array_index = nir_umin(b, nir_channel(b, index, 2),
@@ -392,6 +674,22 @@ unpack_res_index(nir_builder *b, nir_ssa_def *index)
return defs;
}
+/** Whether a surface is accessed through the bindless surface state heap */
+static bool
+is_binding_bindless(unsigned set, unsigned binding, bool sampler,
+ const struct apply_pipeline_layout_state *state)
+{
+   /* Has a binding table entry been allocated for this binding? */
+ if (sampler &&
+ state->set[set].binding[binding].sampler_offset != BINDLESS_OFFSET)
+ return false;
+ if (!sampler &&
+ state->set[set].binding[binding].surface_offset != BINDLESS_OFFSET)
+ return false;
+
+ return true;
+}
+
/** Adjust a Vulkan resource index
*
* This is the equivalent of nir_deref_type_ptr_as_array for resource indices.
@@ -400,25 +698,13 @@ unpack_res_index(nir_builder *b, nir_ssa_def *index)
* vulkan_resource_index intrinsic and we have to do it based on nothing but
* the address format.
*/
-static nir_ssa_def *
-build_res_reindex(nir_builder *b, nir_ssa_def *orig, nir_ssa_def *delta,
- nir_address_format addr_format)
+static nir_def *
+build_res_reindex(nir_builder *b, nir_def *orig, nir_def *delta)
{
- switch (addr_format) {
- case nir_address_format_64bit_global_32bit_offset:
- case nir_address_format_64bit_bounded_global:
- return nir_vec4(b, nir_channel(b, orig, 0),
- nir_channel(b, orig, 1),
- nir_channel(b, orig, 2),
- nir_iadd(b, nir_channel(b, orig, 3), delta));
-
- case nir_address_format_32bit_index_offset:
- return nir_vec2(b, nir_iadd(b, nir_channel(b, orig, 0), delta),
- nir_channel(b, orig, 1));
-
- default:
- unreachable("Unhandled address format");
- }
+ return nir_vec4(b, nir_channel(b, orig, 0),
+ nir_channel(b, orig, 1),
+ nir_channel(b, orig, 2),
+ nir_iadd(b, nir_channel(b, orig, 3), delta));
}
/** Get the address for a descriptor given its resource index
@@ -431,38 +717,31 @@ build_res_reindex(nir_builder *b, nir_ssa_def *orig, nir_ssa_def *delta,
* determine the descriptor stride for array descriptors. The bind_layout is
* optional for buffer descriptor types.
*/
-static nir_ssa_def *
-build_desc_addr(nir_builder *b,
- const struct anv_descriptor_set_binding_layout *bind_layout,
- const VkDescriptorType desc_type,
- nir_ssa_def *index, nir_address_format addr_format,
- struct apply_pipeline_layout_state *state)
+static nir_def *
+build_desc_addr_for_res_index(nir_builder *b,
+ const VkDescriptorType desc_type,
+ nir_def *index, nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
{
+ struct res_index_defs res = unpack_res_index(b, index);
+
+ nir_def *desc_offset = res.desc_offset_base;
+ if (desc_type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ /* Compute the actual descriptor offset. For inline uniform blocks,
+ * the array index is ignored as they are only allowed to be a single
+ * descriptor (not an array) and there is no concept of a "stride".
+       */
+ desc_offset =
+ nir_iadd(b, desc_offset, nir_imul(b, res.array_index, res.desc_stride));
+ }
+
switch (addr_format) {
case nir_address_format_64bit_global_32bit_offset:
case nir_address_format_64bit_bounded_global: {
- struct res_index_defs res = unpack_res_index(b, index);
-
- nir_ssa_def *desc_offset = res.desc_offset_base;
- if (desc_type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
- /* Compute the actual descriptor offset. For inline uniform blocks,
- * the array index is ignored as they are only allowed to be a single
- * descriptor (not an array) and there is no concept of a "stride".
- *
- * We use the bind_layout, if available, because it provides a more
- * accurate descriptor size.
- */
- const unsigned stride = bind_layout ?
- anv_descriptor_size(bind_layout) :
- anv_descriptor_type_size(state->pdevice, desc_type);
-
- desc_offset =
- nir_iadd(b, desc_offset, nir_imul_imm(b, res.array_index, stride));
- }
-
switch (state->desc_addr_format) {
case nir_address_format_64bit_global_32bit_offset: {
- nir_ssa_def *base_addr =
+ nir_def *base_addr =
nir_load_desc_set_address_intel(b, res.set_idx);
return nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_addr),
nir_unpack_64_2x32_split_y(b, base_addr),
@@ -479,15 +758,272 @@ build_desc_addr(nir_builder *b,
}
case nir_address_format_32bit_index_offset:
- assert(desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
+ assert(desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
assert(state->desc_addr_format == nir_address_format_32bit_index_offset);
- return index;
+ return nir_vec2(b, res.set_idx, desc_offset);
+
+ default:
+ unreachable("Unhandled address format");
+ }
+}
+
+static nir_def *
+build_desc_addr_for_binding(nir_builder *b,
+ unsigned set, unsigned binding,
+ nir_def *array_index,
+ const struct apply_pipeline_layout_state *state)
+{
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+
+ switch (state->desc_addr_format) {
+ case nir_address_format_64bit_global_32bit_offset:
+ case nir_address_format_64bit_bounded_global: {
+ nir_def *set_addr = nir_load_desc_set_address_intel(b, nir_imm_int(b, set));
+ nir_def *desc_offset =
+ nir_iadd_imm(b,
+ nir_imul_imm(b,
+ array_index,
+ bind_layout->descriptor_surface_stride),
+ bind_layout->descriptor_surface_offset);
+
+ return nir_vec4(b, nir_unpack_64_2x32_split_x(b, set_addr),
+ nir_unpack_64_2x32_split_y(b, set_addr),
+ nir_imm_int(b, UINT32_MAX),
+ desc_offset);
+ }
+
+ case nir_address_format_32bit_index_offset:
+ return nir_vec2(b,
+ nir_imm_int(b, state->set[set].desc_offset),
+ nir_iadd_imm(b,
+ nir_imul_imm(b,
+ array_index,
+ bind_layout->descriptor_surface_stride),
+ bind_layout->descriptor_surface_offset));
default:
unreachable("Unhandled address format");
}
}
+static unsigned
+binding_descriptor_offset(const struct apply_pipeline_layout_state *state,
+ const struct anv_descriptor_set_binding_layout *bind_layout,
+ bool sampler)
+{
+ if (sampler &&
+ state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT)
+ return bind_layout->descriptor_sampler_offset;
+
+ return bind_layout->descriptor_surface_offset;
+}
+
+static unsigned
+binding_descriptor_stride(const struct apply_pipeline_layout_state *state,
+ const struct anv_descriptor_set_binding_layout *bind_layout,
+ bool sampler)
+{
+ if (sampler &&
+ state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT)
+ return bind_layout->descriptor_sampler_stride;
+
+ return bind_layout->descriptor_surface_stride;
+}
+
+static nir_def *
+build_surface_index_for_binding(nir_builder *b,
+ unsigned set, unsigned binding,
+ nir_def *array_index,
+ unsigned plane,
+ bool non_uniform,
+ const struct apply_pipeline_layout_state *state)
+{
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+ const unsigned descriptor_offset =
+ binding_descriptor_offset(state, bind_layout, false /* sampler */);
+ const unsigned descriptor_stride =
+ binding_descriptor_stride(state, bind_layout, false /* sampler */);
+ const bool is_bindless =
+ is_binding_bindless(set, binding, false /* sampler */, state);
+
+ nir_def *set_offset, *surface_index;
+ if (is_bindless) {
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ set_offset = nir_imm_int(b, 0xdeaddead);
+
+ nir_def *desc_addr =
+ build_desc_addr_for_binding(b, set, binding, array_index, state);
+
+ surface_index =
+ build_load_descriptor_mem(b, desc_addr, 0, 1, 32, state);
+ } else {
+ set_offset =
+ nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants,
+ desc_surface_offsets[set]),
+ .range = sizeof_field(struct anv_push_constants,
+ desc_surface_offsets[set]));
+
+         /* With bindless, indexes are offsets in the descriptor buffer */
+ surface_index =
+ nir_iadd_imm(b,
+ nir_imul_imm(b, array_index, descriptor_stride),
+ descriptor_offset);
+ if (plane != 0) {
+ assert(plane < bind_layout->max_plane_count);
+ surface_index = nir_iadd_imm(b, surface_index,
+ plane * (descriptor_stride /
+ bind_layout->max_plane_count));
+ }
+
+ assert(descriptor_offset % 64 == 0);
+ assert(descriptor_stride % 64 == 0);
+ }
+ } else {
+ /* Unused */
+ set_offset = nir_imm_int(b, 0xdeaddead);
+
+ unsigned bti_stride = bti_multiplier(state, set, binding);
+ assert(bti_stride >= 1);
+
+ /* For Ycbcr descriptors, add the plane offset */
+ unsigned element_index = plane;
+
+ /* With the binding table, it's an index in the table */
+ surface_index =
+ nir_iadd_imm(b, nir_imul_imm(b, array_index, bti_stride),
+ state->set[set].binding[binding].surface_offset + element_index);
+ assert(state->set[set].binding[binding].surface_offset < MAX_BINDING_TABLE_SIZE);
+ }
+
+ return nir_resource_intel(b,
+ set_offset,
+ surface_index,
+ array_index,
+ nir_imm_int(b, 0) /* bindless_base_offset */,
+ .desc_set = set,
+ .binding = binding,
+ .resource_block_intel = state->set[set].binding[binding].push_block,
+ .resource_access_intel =
+ (is_bindless ? nir_resource_intel_bindless : 0) |
+ (non_uniform ? nir_resource_intel_non_uniform : 0) |
+ ((state->set[set].binding[binding].properties &
+ BINDING_PROPERTY_PUSHABLE) ? nir_resource_intel_pushable : 0));
+}
+
+static nir_def *
+build_sampler_handle_for_binding(nir_builder *b,
+ unsigned set, unsigned binding,
+ nir_def *array_index,
+ unsigned plane,
+ bool non_uniform,
+ const struct apply_pipeline_layout_state *state)
+{
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+ const unsigned descriptor_offset =
+ binding_descriptor_offset(state, bind_layout, true /* sampler */);
+ const unsigned descriptor_stride =
+ binding_descriptor_stride(state, bind_layout, true /* sampler */);
+ const bool is_embedded =
+ state->set[set].binding[binding].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER;
+ const bool is_bindless =
+ is_binding_bindless(set, binding, true /* sampler */, state);
+ nir_def *set_offset, *sampler_index, *sampler_base_offset = nir_imm_int(b, 0);
+
+ if (is_embedded) {
+ set_offset = nir_imm_int(b, 0xdeaddead);
+ sampler_index = nir_load_reloc_const_intel(
+ b, BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE +
+ state->set[set].binding[binding].embedded_sampler_index);
+ } else if (is_bindless) {
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ set_offset = nir_imm_int(b, 0xdeaddead);
+
+ nir_def *desc_addr =
+ build_desc_addr_for_binding(b, set, binding, array_index, state);
+
+         /* This is an anv_sampled_image_descriptor; the sampler handle is
+          * always in component 1.
+          */
+ nir_def *desc_data =
+ build_load_descriptor_mem(b, desc_addr, 0, 2, 32, state);
+
+ sampler_index = nir_channel(b, desc_data, 1);
+ } else {
+ set_offset =
+ nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants,
+ desc_sampler_offsets[set]),
+ .range = sizeof_field(struct anv_push_constants,
+ desc_sampler_offsets[set]));
+
+ uint32_t base_offset = descriptor_offset;
+
+         /* The SAMPLER_STATE can only be located at a 64 byte offset in the
+          * combined image/sampler case. Combined image/sampler is not
+          * supported with mutable descriptor types.
+          */
+ if (bind_layout->data & ANV_DESCRIPTOR_SURFACE_SAMPLER)
+ base_offset += ANV_SURFACE_STATE_SIZE;
+
+ if (plane != 0) {
+ assert(plane < bind_layout->max_plane_count);
+ base_offset += plane * (descriptor_stride /
+ bind_layout->max_plane_count);
+ }
+
+ sampler_index =
+ nir_iadd_imm(b,
+ nir_imul_imm(b, array_index, descriptor_stride),
+ base_offset);
+ }
+ } else {
+ /* Unused */
+ set_offset = nir_imm_int(b, 0xdeaddead);
+
+ sampler_index =
+ nir_iadd_imm(b, array_index,
+ state->set[set].binding[binding].sampler_offset + plane);
+ }
+
+ nir_resource_data_intel sampler_resource = nir_resource_intel_sampler;
+ if (is_bindless)
+ sampler_resource |= nir_resource_intel_bindless;
+ if (is_embedded)
+ sampler_resource |= nir_resource_intel_sampler_embedded;
+ if (non_uniform)
+ sampler_resource |= nir_resource_intel_non_uniform;
+
+ return nir_resource_intel(b,
+ set_offset,
+ sampler_index,
+ array_index,
+ sampler_base_offset,
+ .desc_set = set,
+ .binding = binding,
+ .resource_access_intel = sampler_resource);
+}
+
+static nir_def *
+build_buffer_dynamic_offset_for_res_index(nir_builder *b,
+ nir_def *dyn_offset_base,
+ nir_def *array_index,
+ struct apply_pipeline_layout_state *state)
+{
+ nir_def *dyn_offset_idx = nir_iadd(b, dyn_offset_base, array_index);
+
+ nir_def *dyn_load =
+ nir_load_push_constant(b, 1, 32, nir_imul_imm(b, dyn_offset_idx, 4),
+ .base = offsetof(struct anv_push_constants, dynamic_offsets),
+ .range = sizeof_field(struct anv_push_constants, dynamic_offsets));
+
+ return nir_bcsel(b, nir_ieq_imm(b, dyn_offset_base, 0xff),
+ nir_imm_int(b, 0), dyn_load);
+}
+
/** Convert a Vulkan resource index into a buffer address
*
* In some cases, this does a memory load from the descriptor set and, in
@@ -495,62 +1031,52 @@ build_desc_addr(nir_builder *b,
*
* See build_res_index for details about each resource index format.
*/
-static nir_ssa_def *
-build_buffer_addr_for_res_index(nir_builder *b,
- const VkDescriptorType desc_type,
- nir_ssa_def *res_index,
- nir_address_format addr_format,
- struct apply_pipeline_layout_state *state)
+static nir_def *
+build_indirect_buffer_addr_for_res_index(nir_builder *b,
+ const VkDescriptorType desc_type,
+ nir_def *res_index,
+ nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
{
- if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+ struct res_index_defs res = unpack_res_index(b, res_index);
+
+ if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
assert(addr_format == state->desc_addr_format);
- return build_desc_addr(b, NULL, desc_type, res_index, addr_format, state);
+ return build_desc_addr_for_res_index(b, desc_type, res_index,
+ addr_format, state);
} else if (addr_format == nir_address_format_32bit_index_offset) {
- nir_ssa_def *array_index = nir_channel(b, res_index, 0);
- nir_ssa_def *packed = nir_channel(b, res_index, 1);
- nir_ssa_def *array_max = nir_extract_u16(b, packed, nir_imm_int(b, 1));
- nir_ssa_def *surface_index = nir_extract_u16(b, packed, nir_imm_int(b, 0));
-
- if (state->add_bounds_checks)
- array_index = nir_umin(b, array_index, array_max);
-
- return nir_vec2(b, nir_iadd(b, surface_index, array_index),
+ return nir_vec2(b, nir_iadd(b, res.bti_idx, res.array_index),
nir_imm_int(b, 0));
}
- nir_ssa_def *desc_addr =
- build_desc_addr(b, NULL, desc_type, res_index, addr_format, state);
+ nir_def *desc_addr =
+ build_desc_addr_for_res_index(b, desc_type, res_index,
+ addr_format, state);
- nir_ssa_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 4, 32, state);
+ nir_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 4, 32, state);
if (state->has_dynamic_buffers) {
- struct res_index_defs res = unpack_res_index(b, res_index);
-
/* This shader has dynamic offsets and we have no way of knowing
* (save from the dynamic offset base index) if this buffer has a
* dynamic offset.
*/
- nir_ssa_def *dyn_offset_idx =
+ nir_def *dyn_offset_idx =
nir_iadd(b, res.dyn_offset_base, res.array_index);
- if (state->add_bounds_checks) {
- dyn_offset_idx = nir_umin(b, dyn_offset_idx,
- nir_imm_int(b, MAX_DYNAMIC_BUFFERS));
- }
- nir_ssa_def *dyn_load =
+ nir_def *dyn_load =
nir_load_push_constant(b, 1, 32, nir_imul_imm(b, dyn_offset_idx, 4),
.base = offsetof(struct anv_push_constants, dynamic_offsets),
.range = MAX_DYNAMIC_BUFFERS * 4);
- nir_ssa_def *dynamic_offset =
+ nir_def *dynamic_offset =
nir_bcsel(b, nir_ieq_imm(b, res.dyn_offset_base, 0xff),
nir_imm_int(b, 0), dyn_load);
/* The dynamic offset gets added to the base pointer so that we
* have a sliding window range.
*/
- nir_ssa_def *base_ptr =
- nir_pack_64_2x32(b, nir_channels(b, desc, 0x3));
+ nir_def *base_ptr =
+ nir_pack_64_2x32(b, nir_trim_vector(b, desc, 2));
base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset));
desc = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr),
nir_unpack_64_2x32_split_y(b, base_ptr),
@@ -568,50 +1094,138 @@ build_buffer_addr_for_res_index(nir_builder *b,
nir_imm_int(b, 0));
}
+static nir_def *
+build_direct_buffer_addr_for_res_index(nir_builder *b,
+ const VkDescriptorType desc_type,
+ nir_def *res_index,
+ nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
+{
+ if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ assert(addr_format == state->desc_addr_format);
+ return build_desc_addr_for_res_index(b, desc_type, res_index,
+ addr_format, state);
+ } else if (addr_format == nir_address_format_32bit_index_offset) {
+ struct res_index_defs res = unpack_res_index(b, res_index);
+
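+ /* The index component is the offset of the selected element's
+ * descriptor: the binding's base offset plus array_index times the
+ * descriptor stride.
+ */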
+ return nir_vec2(b, nir_iadd(b, res.desc_offset_base,
+ nir_imul(b, res.array_index, res.desc_stride)),
+ nir_imm_int(b, 0));
+ }
+
+ nir_def *desc_addr =
+ build_desc_addr_for_res_index(b, desc_type, res_index,
+ addr_format, state);
+
+ nir_def *addr =
+ build_load_render_surface_state_address(b, desc_addr, state);
+
+ if (state->has_dynamic_buffers) {
+ struct res_index_defs res = unpack_res_index(b, res_index);
+
+ /* This shader has dynamic offsets and we have no way of knowing (save
+ * for the dynamic offset base index) if this buffer has a dynamic
+ * offset.
+ */
+ nir_def *dynamic_offset =
+ build_buffer_dynamic_offset_for_res_index(
+ b, res.dyn_offset_base, res.array_index, state);
+
+ /* The dynamic offset gets added to the base pointer so that we
+ * have a sliding window range.
+ */
+ nir_def *base_ptr =
+ nir_pack_64_2x32(b, nir_trim_vector(b, addr, 2));
+ base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset));
+ addr = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr),
+ nir_unpack_64_2x32_split_y(b, base_ptr),
+ nir_channel(b, addr, 2),
+ nir_channel(b, addr, 3));
+ }
+
+ /* The last element of the vec4 is always zero.
+ *
+ * See also struct anv_address_range_descriptor
+ */
+ return nir_vec4(b, nir_channel(b, addr, 0),
+ nir_channel(b, addr, 1),
+ nir_channel(b, addr, 2),
+ nir_imm_int(b, 0));
+}
+
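+/* Pick between the indirect path (descriptor data loaded back from the
+ * descriptor buffer) and the direct path (address derived from the
+ * RENDER_SURFACE_STATE) based on the pipeline layout type.
+ */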
+static nir_def *
+build_buffer_addr_for_res_index(nir_builder *b,
+ const VkDescriptorType desc_type,
+ nir_def *res_index,
+ nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
+{
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT)
+ return build_indirect_buffer_addr_for_res_index(b, desc_type, res_index, addr_format, state);
+ else
+ return build_direct_buffer_addr_for_res_index(b, desc_type, res_index, addr_format, state);
+}
+
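+/* Like build_buffer_addr_for_res_index(), but in the binding table case the
+ * surface index is derived from the set/binding rather than from the packed
+ * resource index.
+ */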
+static nir_def *
+build_buffer_addr_for_binding(nir_builder *b,
+ const VkDescriptorType desc_type,
+ unsigned set,
+ unsigned binding,
+ nir_def *res_index,
+ nir_address_format addr_format,
+ struct apply_pipeline_layout_state *state)
+{
+ if (addr_format != nir_address_format_32bit_index_offset)
+ return build_buffer_addr_for_res_index(b, desc_type, res_index, addr_format, state);
+
+ if (desc_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
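+ /* Inline uniform blocks live directly in the descriptor buffer: use the
+ * set's descriptor buffer binding table entry and the binding's offset
+ * within the set.
+ */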
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+ return nir_vec2(b,
+ nir_imm_int(b, state->set[set].desc_offset),
+ nir_imm_int(b, bind_layout->descriptor_surface_offset));
+ }
+
+ struct res_index_defs res = unpack_res_index(b, res_index);
+
+ return nir_vec2(b,
+ build_surface_index_for_binding(b, set, binding, res.array_index,
+ 0 /* plane */,
+ false /* non_uniform */,
+ state),
+ nir_imm_int(b, 0));
+}
+
/** Loads descriptor memory for a variable-based deref chain
*
* The deref chain has to terminate at a variable with a descriptor_set and
* binding set. This is used for images, textures, and samplers.
*/
-static nir_ssa_def *
-build_load_var_deref_descriptor_mem(nir_builder *b, nir_deref_instr *deref,
- unsigned desc_offset,
- unsigned num_components, unsigned bit_size,
+static nir_def *
+build_load_var_deref_surface_handle(nir_builder *b, nir_deref_instr *deref,
+ bool non_uniform,
+ bool *out_is_bindless,
struct apply_pipeline_layout_state *state)
{
nir_variable *var = nir_deref_instr_get_variable(deref);
const uint32_t set = var->data.descriptor_set;
const uint32_t binding = var->data.binding;
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
- nir_ssa_def *array_index;
+ *out_is_bindless =
+ is_binding_bindless(set, binding, false /* sampler */, state);
+
+ nir_def *array_index;
if (deref->deref_type != nir_deref_type_var) {
assert(deref->deref_type == nir_deref_type_array);
assert(nir_deref_instr_parent(deref)->deref_type == nir_deref_type_var);
- assert(deref->arr.index.is_ssa);
array_index = deref->arr.index.ssa;
} else {
array_index = nir_imm_int(b, 0);
}
- /* It doesn't really matter what address format we choose as everything
- * will constant-fold nicely. Choose one that uses the actual descriptor
- * buffer so we don't run into issues index/offset assumptions.
- */
- const nir_address_format addr_format =
- nir_address_format_64bit_bounded_global;
-
- nir_ssa_def *res_index =
- build_res_index(b, set, binding, array_index, addr_format, state);
-
- nir_ssa_def *desc_addr =
- build_desc_addr(b, bind_layout, bind_layout->type,
- res_index, addr_format, state);
-
- return build_load_descriptor_mem(b, desc_addr, desc_offset,
- num_components, bit_size, state);
+ return build_surface_index_for_binding(b, set, binding, array_index,
+ 0 /* plane */, non_uniform, state);
}
/** A recursive form of build_res_index()
@@ -621,7 +1235,7 @@ build_load_var_deref_descriptor_mem(nir_builder *b, nir_deref_instr *deref,
* hopes of better CSE. This means the cursor is not where you left it when
* this function returns.
*/
-static nir_ssa_def *
+static nir_def *
build_res_index_for_chain(nir_builder *b, nir_intrinsic_instr *intrin,
nir_address_format addr_format,
uint32_t *set, uint32_t *binding,
@@ -629,22 +1243,19 @@ build_res_index_for_chain(nir_builder *b, nir_intrinsic_instr *intrin,
{
if (intrin->intrinsic == nir_intrinsic_vulkan_resource_index) {
b->cursor = nir_before_instr(&intrin->instr);
- assert(intrin->src[0].is_ssa);
*set = nir_intrinsic_desc_set(intrin);
*binding = nir_intrinsic_binding(intrin);
- return build_res_index(b, *set, *binding, intrin->src[0].ssa,
- addr_format, state);
+ return build_res_index(b, *set, *binding, intrin->src[0].ssa, state);
} else {
assert(intrin->intrinsic == nir_intrinsic_vulkan_resource_reindex);
nir_intrinsic_instr *parent = nir_src_as_intrinsic(intrin->src[0]);
- nir_ssa_def *index =
+ nir_def *index =
build_res_index_for_chain(b, parent, addr_format,
set, binding, state);
b->cursor = nir_before_instr(&intrin->instr);
- assert(intrin->src[1].is_ssa);
- return build_res_reindex(b, index, intrin->src[1].ssa, addr_format);
+ return build_res_reindex(b, index, intrin->src[1].ssa);
}
}
@@ -652,22 +1263,23 @@ build_res_index_for_chain(nir_builder *b, nir_intrinsic_instr *intrin,
*
* The cursor is not where you left it when this function returns.
*/
-static nir_ssa_def *
+static nir_def *
build_buffer_addr_for_idx_intrin(nir_builder *b,
nir_intrinsic_instr *idx_intrin,
nir_address_format addr_format,
struct apply_pipeline_layout_state *state)
{
uint32_t set = UINT32_MAX, binding = UINT32_MAX;
- nir_ssa_def *res_index =
+ nir_def *res_index =
build_res_index_for_chain(b, idx_intrin, addr_format,
&set, &binding, state);
const struct anv_descriptor_set_binding_layout *bind_layout =
&state->layout->set[set].layout->binding[binding];
- return build_buffer_addr_for_res_index(b, bind_layout->type,
- res_index, addr_format, state);
+ return build_buffer_addr_for_binding(b, bind_layout->type,
+ set, binding, res_index,
+ addr_format, state);
}
/** Builds a buffer address for deref chain
@@ -677,14 +1289,14 @@ build_buffer_addr_for_idx_intrin(nir_builder *b,
*
* The cursor is not where you left it when this function returns.
*/
-static nir_ssa_def *
+static nir_def *
build_buffer_addr_for_deref(nir_builder *b, nir_deref_instr *deref,
nir_address_format addr_format,
struct apply_pipeline_layout_state *state)
{
nir_deref_instr *parent = nir_deref_instr_parent(deref);
if (parent) {
- nir_ssa_def *addr =
+ nir_def *addr =
build_buffer_addr_for_deref(b, parent, addr_format, state);
b->cursor = nir_before_instr(&deref->instr);
@@ -717,23 +1329,35 @@ try_lower_direct_buffer_intrinsic(nir_builder *b,
return false;
}
+ const unsigned set = nir_intrinsic_desc_set(desc);
+ const unsigned binding = nir_intrinsic_binding(desc);
+
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &state->layout->set[set].layout->binding[binding];
+
nir_address_format addr_format = descriptor_address_format(desc, state);
+ /* Although we could lower non-uniform binding table accesses with
+ * nir_opt_non_uniform_access, we might as well use an A64 message and
+ * avoid the loops inserted by that lowering pass.
+ */
+ if (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)
+ return false;
+
if (nir_deref_mode_is(deref, nir_var_mem_ssbo)) {
/* 64-bit atomics only support A64 messages so we can't lower them to
* the index+offset model.
*/
- if (is_atomic && nir_dest_bit_size(intrin->dest) == 64 &&
+ if (is_atomic && intrin->def.bit_size == 64 &&
!state->pdevice->info.has_lsc)
return false;
- /* Normal binding table-based messages can't handle non-uniform access
- * so we have to fall back to A64.
+ /* If we don't have a BTI for this binding and we're using indirect
+ * descriptors, we'll use A64 messages. This is handled in the main
+ * lowering path.
*/
- if (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)
- return false;
-
- if (!descriptor_has_bti(desc, state))
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT &&
+ !descriptor_has_bti(desc, state))
return false;
/* Rewrite to 32bit_index_offset whenever we can */
@@ -741,12 +1365,36 @@ try_lower_direct_buffer_intrinsic(nir_builder *b,
} else {
assert(nir_deref_mode_is(deref, nir_var_mem_ubo));
- /* Rewrite to 32bit_index_offset whenever we can */
- if (descriptor_has_bti(desc, state))
+ /* If we don't have a BTI for this binding and we're using indirect
+ * descriptors, we'll use A64 messages. This is handled in the main
+ * lowering path.
+ *
+ * We make an exception for uniform blocks which are built from the
+ * descriptor set base address + offset. There is no indirect data to
+ * fetch.
+ */
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT &&
+ bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK &&
+ !descriptor_has_bti(desc, state))
+ return false;
+
+ /* If this is an inline uniform and the shader stage is bindless, we
+ * can't switch to 32bit_index_offset.
+ */
+ if (bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK ||
+ !brw_shader_stage_requires_bindless_resources(b->shader->info.stage))
addr_format = nir_address_format_32bit_index_offset;
}
- nir_ssa_def *addr =
+ /* If a dynamic buffer has not been assigned a binding table entry, we
+ * need to bail here.
+ */
+ if ((bind_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+ bind_layout->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) &&
+ !descriptor_has_bti(desc, state))
+ return false;
+
+ nir_def *addr =
build_buffer_addr_for_deref(b, deref, addr_format, state);
b->cursor = nir_before_instr(&intrin->instr);
@@ -772,26 +1420,22 @@ lower_load_accel_struct_desc(nir_builder *b,
nir_address_format_64bit_bounded_global;
uint32_t set = UINT32_MAX, binding = UINT32_MAX;
- nir_ssa_def *res_index =
+ nir_def *res_index =
build_res_index_for_chain(b, idx_intrin, addr_format,
&set, &binding, state);
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
-
b->cursor = nir_before_instr(&load_desc->instr);
- nir_ssa_def *desc_addr =
- build_desc_addr(b, bind_layout, bind_layout->type,
- res_index, addr_format, state);
+ struct res_index_defs res = unpack_res_index(b, res_index);
+ nir_def *desc_addr =
+ build_desc_addr_for_binding(b, set, binding, res.array_index, state);
/* Acceleration structure descriptors are always uint64_t */
- nir_ssa_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 1, 64, state);
+ nir_def *desc = build_load_descriptor_mem(b, desc_addr, 0, 1, 64, state);
- assert(load_desc->dest.is_ssa);
- assert(load_desc->dest.ssa.bit_size == 64);
- assert(load_desc->dest.ssa.num_components == 1);
- nir_ssa_def_rewrite_uses(&load_desc->dest.ssa, desc);
+ assert(load_desc->def.bit_size == 64);
+ assert(load_desc->def.num_components == 1);
+ nir_def_rewrite_uses(&load_desc->def, desc);
nir_instr_remove(&load_desc->instr);
return true;
@@ -811,20 +1455,8 @@ lower_direct_buffer_instr(nir_builder *b, nir_instr *instr, void *_state)
case nir_intrinsic_store_deref:
return try_lower_direct_buffer_intrinsic(b, intrin, false, state);
- case nir_intrinsic_deref_atomic_add:
- case nir_intrinsic_deref_atomic_imin:
- case nir_intrinsic_deref_atomic_umin:
- case nir_intrinsic_deref_atomic_imax:
- case nir_intrinsic_deref_atomic_umax:
- case nir_intrinsic_deref_atomic_and:
- case nir_intrinsic_deref_atomic_or:
- case nir_intrinsic_deref_atomic_xor:
- case nir_intrinsic_deref_atomic_exchange:
- case nir_intrinsic_deref_atomic_comp_swap:
- case nir_intrinsic_deref_atomic_fadd:
- case nir_intrinsic_deref_atomic_fmin:
- case nir_intrinsic_deref_atomic_fmax:
- case nir_intrinsic_deref_atomic_fcomp_swap:
+ case nir_intrinsic_deref_atomic:
+ case nir_intrinsic_deref_atomic_swap:
return try_lower_direct_buffer_intrinsic(b, intrin, true, state);
case nir_intrinsic_get_ssbo_size: {
@@ -833,23 +1465,30 @@ lower_direct_buffer_instr(nir_builder *b, nir_instr *instr, void *_state)
*/
nir_intrinsic_instr *idx_intrin =
find_descriptor_for_index_src(intrin->src[0], state);
- if (idx_intrin == NULL || !descriptor_has_bti(idx_intrin, state))
+ if (idx_intrin == NULL)
return false;
- b->cursor = nir_before_instr(&intrin->instr);
-
/* We just checked that this is a BTI descriptor */
const nir_address_format addr_format =
nir_address_format_32bit_index_offset;
- nir_ssa_def *buffer_addr =
- build_buffer_addr_for_idx_intrin(b, idx_intrin, addr_format, state);
-
b->cursor = nir_before_instr(&intrin->instr);
- nir_ssa_def *bti = nir_channel(b, buffer_addr, 0);
- nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
- nir_src_for_ssa(bti));
+ uint32_t set = UINT32_MAX, binding = UINT32_MAX;
+ nir_def *res_index =
+ build_res_index_for_chain(b, idx_intrin, addr_format,
+ &set, &binding, state);
+
+ bool non_uniform = nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM;
+
+ nir_def *surface_index =
+ build_surface_index_for_binding(b, set, binding,
+ nir_channel(b, res_index, 3),
+ 0 /* plane */,
+ non_uniform,
+ state);
+
+ nir_src_rewrite(&intrin->src[0], surface_index);
_mesa_set_add(state->lowered_instrs, intrin);
return true;
}
@@ -871,20 +1510,15 @@ lower_res_index_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
{
b->cursor = nir_before_instr(&intrin->instr);
- nir_address_format addr_format =
- addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state);
-
- assert(intrin->src[0].is_ssa);
- nir_ssa_def *index =
+ nir_def *index =
build_res_index(b, nir_intrinsic_desc_set(intrin),
nir_intrinsic_binding(intrin),
intrin->src[0].ssa,
- addr_format, state);
+ state);
- assert(intrin->dest.is_ssa);
- assert(intrin->dest.ssa.bit_size == index->bit_size);
- assert(intrin->dest.ssa.num_components == index->num_components);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, index);
+ assert(intrin->def.bit_size == index->bit_size);
+ assert(intrin->def.num_components == index->num_components);
+ nir_def_rewrite_uses(&intrin->def, index);
nir_instr_remove(&intrin->instr);
return true;
@@ -896,19 +1530,13 @@ lower_res_reindex_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
{
b->cursor = nir_before_instr(&intrin->instr);
- nir_address_format addr_format =
- addr_format_for_desc_type(nir_intrinsic_desc_type(intrin), state);
-
- assert(intrin->src[0].is_ssa && intrin->src[1].is_ssa);
- nir_ssa_def *index =
+ nir_def *index =
build_res_reindex(b, intrin->src[0].ssa,
- intrin->src[1].ssa,
- addr_format);
+ intrin->src[1].ssa);
- assert(intrin->dest.is_ssa);
- assert(intrin->dest.ssa.bit_size == index->bit_size);
- assert(intrin->dest.ssa.num_components == index->num_components);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, index);
+ assert(intrin->def.bit_size == index->bit_size);
+ assert(intrin->def.num_components == index->num_components);
+ nir_def_rewrite_uses(&intrin->def, index);
nir_instr_remove(&intrin->instr);
return true;
@@ -923,40 +1551,14 @@ lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin,
const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
nir_address_format addr_format = addr_format_for_desc_type(desc_type, state);
- assert(intrin->dest.is_ssa);
- nir_foreach_use(src, &intrin->dest.ssa) {
- if (src->parent_instr->type != nir_instr_type_deref)
- continue;
-
- nir_deref_instr *cast = nir_instr_as_deref(src->parent_instr);
- assert(cast->deref_type == nir_deref_type_cast);
- switch (desc_type) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- cast->cast.align_mul = ANV_UBO_ALIGNMENT;
- cast->cast.align_offset = 0;
- break;
-
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- cast->cast.align_mul = ANV_SSBO_ALIGNMENT;
- cast->cast.align_offset = 0;
- break;
-
- default:
- break;
- }
- }
-
- assert(intrin->src[0].is_ssa);
- nir_ssa_def *desc =
- build_buffer_addr_for_res_index(b, desc_type, intrin->src[0].ssa,
+ nir_def *desc =
+ build_buffer_addr_for_res_index(b,
+ desc_type, intrin->src[0].ssa,
addr_format, state);
- assert(intrin->dest.is_ssa);
- assert(intrin->dest.ssa.bit_size == desc->bit_size);
- assert(intrin->dest.ssa.num_components == desc->num_components);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc);
+ assert(intrin->def.bit_size == desc->bit_size);
+ assert(intrin->def.num_components == desc->num_components);
+ nir_def_rewrite_uses(&intrin->def, desc);
nir_instr_remove(&intrin->instr);
return true;
@@ -971,35 +1573,37 @@ lower_get_ssbo_size(nir_builder *b, nir_intrinsic_instr *intrin,
b->cursor = nir_before_instr(&intrin->instr);
- nir_address_format addr_format =
- addr_format_for_desc_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, state);
-
- assert(intrin->src[0].is_ssa);
- nir_ssa_def *desc =
- build_buffer_addr_for_res_index(b, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
- intrin->src[0].ssa, addr_format, state);
-
- switch (addr_format) {
- case nir_address_format_64bit_global_32bit_offset:
- case nir_address_format_64bit_bounded_global: {
- nir_ssa_def *size = nir_channel(b, desc, 2);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, size);
- nir_instr_remove(&intrin->instr);
- break;
- }
+ const nir_address_format addr_format =
+ nir_address_format_64bit_bounded_global;
- case nir_address_format_32bit_index_offset:
- /* The binding table index is the first component of the address. The
- * back-end wants a scalar binding table index source.
+ nir_def *desc_addr =
+ nir_build_addr_iadd_imm(
+ b,
+ build_desc_addr_for_res_index(b,
+ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ intrin->src[0].ssa,
+ addr_format, state),
+ addr_format,
+ nir_var_mem_ssbo,
+ state->pdevice->isl_dev.ss.size);
+
+ nir_def *desc_range;
+ if (state->layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ /* Load the anv_address_range_descriptor */
+ desc_range =
+ build_load_descriptor_mem(b, desc_addr, 0, 4, 32, state);
+ } else {
+ /* Build a vec4 similar to anv_address_range_descriptor using the
+ * RENDER_SURFACE_STATE.
*/
- nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
- nir_src_for_ssa(nir_channel(b, desc, 0)));
- break;
-
- default:
- unreachable("Unsupported address format");
+ desc_range =
+ build_load_render_surface_state_address(b, desc_addr, state);
}
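+ /* In both layouts the buffer size lives in the third component of the
+ * range descriptor.
+ */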
+ nir_def *size = nir_channel(b, desc_range, 2);
+ nir_def_rewrite_uses(&intrin->def, size);
+ nir_instr_remove(&intrin->instr);
+
return true;
}
@@ -1008,53 +1612,67 @@ lower_image_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
struct apply_pipeline_layout_state *state)
{
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
- nir_variable *var = nir_deref_instr_get_variable(deref);
-
- unsigned set = var->data.descriptor_set;
- unsigned binding = var->data.binding;
- unsigned binding_offset = state->set[set].surface_offsets[binding];
b->cursor = nir_before_instr(&intrin->instr);
- ASSERTED const bool use_bindless = state->pdevice->has_bindless_images;
+ bool non_uniform = nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM;
+ bool is_bindless;
+ nir_def *handle =
+ build_load_var_deref_surface_handle(b, deref, non_uniform,
+ &is_bindless, state);
+ nir_rewrite_image_intrinsic(intrin, handle, is_bindless);
+
+ return true;
+}
- if (intrin->intrinsic == nir_intrinsic_image_deref_load_param_intel) {
- b->cursor = nir_instr_remove(&intrin->instr);
+static bool
+lower_image_size_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
+ struct apply_pipeline_layout_state *state)
+{
+ if (nir_intrinsic_image_dim(intrin) != GLSL_SAMPLER_DIM_3D)
+ return lower_image_intrinsic(b, intrin, state);
- assert(!use_bindless); /* Otherwise our offsets would be wrong */
- const unsigned param = nir_intrinsic_base(intrin);
+ nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
- nir_ssa_def *desc =
- build_load_var_deref_descriptor_mem(b, deref, param * 16,
- intrin->dest.ssa.num_components,
- intrin->dest.ssa.bit_size, state);
+ b->cursor = nir_before_instr(&intrin->instr);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc);
- } else if (binding_offset > MAX_BINDING_TABLE_SIZE) {
- const bool write_only =
- (var->data.access & ACCESS_NON_READABLE) != 0;
- nir_ssa_def *desc =
- build_load_var_deref_descriptor_mem(b, deref, 0, 2, 32, state);
- nir_ssa_def *handle = nir_channel(b, desc, write_only ? 1 : 0);
- nir_rewrite_image_intrinsic(intrin, handle, true);
- } else {
- unsigned array_size =
- state->layout->set[set].layout->binding[binding].array_size;
-
- nir_ssa_def *index = NULL;
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
- index = nir_ssa_for_src(b, deref->arr.index, 1);
- if (state->add_bounds_checks)
- index = nir_umin(b, index, nir_imm_int(b, array_size - 1));
- } else {
- index = nir_imm_int(b, 0);
- }
+ bool non_uniform = nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM;
+ bool is_bindless;
+ nir_def *handle =
+ build_load_var_deref_surface_handle(b, deref, non_uniform,
+ &is_bindless, state);
+ nir_rewrite_image_intrinsic(intrin, handle, is_bindless);
- index = nir_iadd_imm(b, index, binding_offset);
- nir_rewrite_image_intrinsic(intrin, index, false);
+ nir_variable *var = nir_deref_instr_get_variable(deref);
+ const uint32_t set = var->data.descriptor_set;
+ const uint32_t binding = var->data.binding;
+
+ nir_def *array_index;
+ if (deref->deref_type != nir_deref_type_var) {
+ assert(deref->deref_type == nir_deref_type_array);
+ assert(nir_deref_instr_parent(deref)->deref_type == nir_deref_type_var);
+ array_index = deref->arr.index.ssa;
+ } else {
+ array_index = nir_imm_int(b, 0);
}
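+ /* The depth of a 3D storage image comes from its descriptor; load it
+ * and splice it into component 2 of the image size result below.
+ */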
+ nir_def *desc_addr = build_desc_addr_for_binding(
+ b, set, binding, array_index, state);
+
+ b->cursor = nir_after_instr(&intrin->instr);
+
+ nir_def *image_depth =
+ build_load_storage_3d_image_depth(b, desc_addr,
+ nir_channel(b, &intrin->def, 2),
+ state);
+
+ nir_def *comps[4] = {};
+ for (unsigned c = 0; c < intrin->def.num_components; c++)
+ comps[c] = c == 2 ? image_depth : nir_channel(b, &intrin->def, c);
+
+ nir_def *vec = nir_vec(b, comps, intrin->def.num_components);
+ nir_def_rewrite_uses_after(&intrin->def, vec, vec->parent_instr);
+
return true;
}
@@ -1068,40 +1686,45 @@ lower_load_constant(nir_builder *b, nir_intrinsic_instr *intrin,
* by constant folding.
*/
assert(!nir_src_is_const(intrin->src[0]));
- nir_ssa_def *offset = nir_iadd_imm(b, nir_ssa_for_src(b, intrin->src[0], 1),
+ nir_def *offset = nir_iadd_imm(b, intrin->src[0].ssa,
nir_intrinsic_base(intrin));
- nir_ssa_def *data;
- if (state->pdevice->use_softpin) {
- unsigned load_size = intrin->dest.ssa.num_components *
- intrin->dest.ssa.bit_size / 8;
- unsigned load_align = intrin->dest.ssa.bit_size / 8;
+ unsigned load_size = intrin->def.num_components *
+ intrin->def.bit_size / 8;
+ unsigned load_align = intrin->def.bit_size / 8;
- assert(load_size < b->shader->constant_data_size);
- unsigned max_offset = b->shader->constant_data_size - load_size;
- offset = nir_umin(b, offset, nir_imm_int(b, max_offset));
+ assert(load_size < b->shader->constant_data_size);
+ unsigned max_offset = b->shader->constant_data_size - load_size;
+ offset = nir_umin(b, offset, nir_imm_int(b, max_offset));
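+ /* The constant data address comes from relocation constants resolved
+ * once the shader's constant data buffer address is known.
+ */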
- nir_ssa_def *const_data_base_addr = nir_pack_64_2x32_split(b,
+ nir_def *const_data_addr = nir_pack_64_2x32_split(b,
+ nir_iadd(b,
nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
- nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
+ offset),
+ nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
- data = nir_load_global_constant(b, nir_iadd(b, const_data_base_addr,
- nir_u2u64(b, offset)),
- load_align,
- intrin->dest.ssa.num_components,
- intrin->dest.ssa.bit_size);
- } else {
- nir_ssa_def *index = nir_imm_int(b, state->constants_offset);
-
- data = nir_load_ubo(b, intrin->num_components, intrin->dest.ssa.bit_size,
- index, offset,
- .align_mul = intrin->dest.ssa.bit_size / 8,
- .align_offset = 0,
- .range_base = nir_intrinsic_base(intrin),
- .range = nir_intrinsic_range(intrin));
- }
+ nir_def *data =
+ nir_load_global_constant(b, const_data_addr,
+ load_align,
+ intrin->def.num_components,
+ intrin->def.bit_size);
+
+ nir_def_rewrite_uses(&intrin->def, data);
+
+ return true;
+}
+
+static bool
+lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin,
+ struct apply_pipeline_layout_state *state)
+{
+ b->cursor = nir_instr_remove(&intrin->instr);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, data);
+ nir_def *base_workgroup_id =
+ nir_load_push_constant(b, 3, 32, nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants, cs.base_work_group_id),
+ .range = sizeof_field(struct anv_push_constants, cs.base_work_group_id));
+ nir_def_rewrite_uses(&intrin->def, base_workgroup_id);
return true;
}
@@ -1109,7 +1732,7 @@ lower_load_constant(nir_builder *b, nir_intrinsic_instr *intrin,
static void
lower_tex_deref(nir_builder *b, nir_tex_instr *tex,
nir_tex_src_type deref_src_type,
- unsigned *base_index, unsigned plane,
+ unsigned base_index, unsigned plane,
struct apply_pipeline_layout_state *state)
{
int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type);
@@ -1119,91 +1742,44 @@ lower_tex_deref(nir_builder *b, nir_tex_instr *tex,
nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
nir_variable *var = nir_deref_instr_get_variable(deref);
- unsigned set = var->data.descriptor_set;
- unsigned binding = var->data.binding;
- unsigned array_size =
- state->layout->set[set].layout->binding[binding].array_size;
+ const bool is_sampler = deref_src_type == nir_tex_src_sampler_deref;
+ const unsigned set = var->data.descriptor_set;
+ const unsigned binding = var->data.binding;
+ const bool bindless = is_binding_bindless(set, binding, is_sampler, state);
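+ /* The deref source gets rewritten to either a binding table offset or a
+ * bindless handle; the source type chosen below reflects which.
+ */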
- unsigned binding_offset;
- if (deref_src_type == nir_tex_src_texture_deref) {
- binding_offset = state->set[set].surface_offsets[binding];
+ nir_def *array_index = NULL;
+ if (deref->deref_type != nir_deref_type_var) {
+ assert(deref->deref_type == nir_deref_type_array);
+
+ array_index = deref->arr.index.ssa;
} else {
- assert(deref_src_type == nir_tex_src_sampler_deref);
- binding_offset = state->set[set].sampler_offsets[binding];
+ array_index = nir_imm_int(b, 0);
}
nir_tex_src_type offset_src_type;
- nir_ssa_def *index = NULL;
- if (binding_offset > MAX_BINDING_TABLE_SIZE) {
- const unsigned plane_offset =
- plane * sizeof(struct anv_sampled_image_descriptor);
-
- nir_ssa_def *desc =
- build_load_var_deref_descriptor_mem(b, deref, plane_offset,
- 2, 32, state);
-
- if (deref_src_type == nir_tex_src_texture_deref) {
- offset_src_type = nir_tex_src_texture_handle;
- index = nir_channel(b, desc, 0);
- } else {
- assert(deref_src_type == nir_tex_src_sampler_deref);
- offset_src_type = nir_tex_src_sampler_handle;
- index = nir_channel(b, desc, 1);
- }
+ nir_def *index;
+ if (deref_src_type == nir_tex_src_texture_deref) {
+ index = build_surface_index_for_binding(b, set, binding, array_index,
+ plane,
+ tex->texture_non_uniform,
+ state);
+ offset_src_type = bindless ?
+ nir_tex_src_texture_handle :
+ nir_tex_src_texture_offset;
} else {
- if (deref_src_type == nir_tex_src_texture_deref) {
- offset_src_type = nir_tex_src_texture_offset;
- } else {
- assert(deref_src_type == nir_tex_src_sampler_deref);
- offset_src_type = nir_tex_src_sampler_offset;
- }
-
- *base_index = binding_offset + plane;
-
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
-
- if (nir_src_is_const(deref->arr.index)) {
- unsigned arr_index = MIN2(nir_src_as_uint(deref->arr.index), array_size - 1);
- struct anv_sampler **immutable_samplers =
- state->layout->set[set].layout->binding[binding].immutable_samplers;
- if (immutable_samplers) {
- /* Array of YCbCr samplers are tightly packed in the binding
- * tables, compute the offset of an element in the array by
- * adding the number of planes of all preceding elements.
- */
- unsigned desc_arr_index = 0;
- for (int i = 0; i < arr_index; i++)
- desc_arr_index += immutable_samplers[i]->n_planes;
- *base_index += desc_arr_index;
- } else {
- *base_index += arr_index;
- }
- } else {
- /* From VK_KHR_sampler_ycbcr_conversion:
- *
- * If sampler Y’CBCR conversion is enabled, the combined image
- * sampler must be indexed only by constant integral expressions
- * when aggregated into arrays in shader code, irrespective of
- * the shaderSampledImageArrayDynamicIndexing feature.
- */
- assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1);
-
- index = nir_ssa_for_src(b, deref->arr.index, 1);
+ assert(deref_src_type == nir_tex_src_sampler_deref);
- if (state->add_bounds_checks)
- index = nir_umin(b, index, nir_imm_int(b, array_size - 1));
- }
- }
+ index = build_sampler_handle_for_binding(b, set, binding, array_index,
+ plane,
+ tex->sampler_non_uniform,
+ state);
+ offset_src_type = bindless ?
+ nir_tex_src_sampler_handle :
+ nir_tex_src_sampler_offset;
}
- if (index) {
- nir_instr_rewrite_src(&tex->instr, &tex->src[deref_src_idx].src,
- nir_src_for_ssa(index));
- tex->src[deref_src_idx].src_type = offset_src_type;
- } else {
- nir_tex_instr_remove_src(tex, deref_src_idx);
- }
+ nir_src_rewrite(&tex->src[deref_src_idx].src, index);
+ tex->src[deref_src_idx].src_type = offset_src_type;
}
static uint32_t
@@ -1220,106 +1796,51 @@ tex_instr_get_and_remove_plane_src(nir_tex_instr *tex)
return plane;
}
-static nir_ssa_def *
-build_def_array_select(nir_builder *b, nir_ssa_def **srcs, nir_ssa_def *idx,
+static nir_def *
+build_def_array_select(nir_builder *b, nir_def **srcs, nir_def *idx,
unsigned start, unsigned end)
{
if (start == end - 1) {
return srcs[start];
} else {
unsigned mid = start + (end - start) / 2;
- return nir_bcsel(b, nir_ilt(b, idx, nir_imm_int(b, mid)),
+ return nir_bcsel(b, nir_ilt_imm(b, idx, mid),
build_def_array_select(b, srcs, idx, start, mid),
build_def_array_select(b, srcs, idx, mid, end));
}
}
-static void
-lower_gfx7_tex_swizzle(nir_builder *b, nir_tex_instr *tex, unsigned plane,
- struct apply_pipeline_layout_state *state)
-{
- assert(state->pdevice->info.verx10 == 70);
- if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ||
- nir_tex_instr_is_query(tex) ||
- tex->op == nir_texop_tg4 || /* We can't swizzle TG4 */
- (tex->is_shadow && tex->is_new_style_shadow))
- return;
-
- int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
- assert(deref_src_idx >= 0);
-
- nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
- nir_variable *var = nir_deref_instr_get_variable(deref);
-
- unsigned set = var->data.descriptor_set;
- unsigned binding = var->data.binding;
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &state->layout->set[set].layout->binding[binding];
-
- if ((bind_layout->data & ANV_DESCRIPTOR_TEXTURE_SWIZZLE) == 0)
- return;
-
- b->cursor = nir_before_instr(&tex->instr);
-
- const unsigned plane_offset =
- plane * sizeof(struct anv_texture_swizzle_descriptor);
- nir_ssa_def *swiz =
- build_load_var_deref_descriptor_mem(b, deref, plane_offset,
- 1, 32, state);
-
- b->cursor = nir_after_instr(&tex->instr);
-
- assert(tex->dest.ssa.bit_size == 32);
- assert(tex->dest.ssa.num_components == 4);
-
- /* Initializing to undef is ok; nir_opt_undef will clean it up. */
- nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
- nir_ssa_def *comps[8];
- for (unsigned i = 0; i < ARRAY_SIZE(comps); i++)
- comps[i] = undef;
-
- comps[ISL_CHANNEL_SELECT_ZERO] = nir_imm_int(b, 0);
- if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float)
- comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_float(b, 1);
- else
- comps[ISL_CHANNEL_SELECT_ONE] = nir_imm_int(b, 1);
- comps[ISL_CHANNEL_SELECT_RED] = nir_channel(b, &tex->dest.ssa, 0);
- comps[ISL_CHANNEL_SELECT_GREEN] = nir_channel(b, &tex->dest.ssa, 1);
- comps[ISL_CHANNEL_SELECT_BLUE] = nir_channel(b, &tex->dest.ssa, 2);
- comps[ISL_CHANNEL_SELECT_ALPHA] = nir_channel(b, &tex->dest.ssa, 3);
-
- nir_ssa_def *swiz_comps[4];
- for (unsigned i = 0; i < 4; i++) {
- nir_ssa_def *comp_swiz = nir_extract_u8(b, swiz, nir_imm_int(b, i));
- swiz_comps[i] = build_def_array_select(b, comps, comp_swiz, 0, 8);
- }
- nir_ssa_def *swiz_tex_res = nir_vec(b, swiz_comps, 4);
-
- /* Rewrite uses before we insert so we don't rewrite this use */
- nir_ssa_def_rewrite_uses_after(&tex->dest.ssa,
- swiz_tex_res,
- swiz_tex_res->parent_instr);
-}
-
static bool
lower_tex(nir_builder *b, nir_tex_instr *tex,
struct apply_pipeline_layout_state *state)
{
unsigned plane = tex_instr_get_and_remove_plane_src(tex);
- /* On Ivy Bridge and Bay Trail, we have to swizzle in the shader. Do this
- * before we lower the derefs away so we can still find the descriptor.
- */
- if (state->pdevice->info.verx10 == 70)
- lower_gfx7_tex_swizzle(b, tex, plane, state);
-
b->cursor = nir_before_instr(&tex->instr);
lower_tex_deref(b, tex, nir_tex_src_texture_deref,
- &tex->texture_index, plane, state);
-
+ tex->texture_index, plane, state);
lower_tex_deref(b, tex, nir_tex_src_sampler_deref,
- &tex->sampler_index, plane, state);
+ tex->sampler_index, plane, state);
+
+ /* The whole lot will be embedded in the offset/handle source */
+ tex->texture_index = 0;
+ tex->sampler_index = 0;
+
+ return true;
+}
+
+static bool
+lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin,
+ struct apply_pipeline_layout_state *state)
+{
+ b->cursor = nir_instr_remove(&intrin->instr);
+
+ nir_def *rq_globals =
+ nir_load_push_constant(b, 1, 64, nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants, ray_query_globals),
+ .range = sizeof_field(struct anv_push_constants, ray_query_globals));
+ nir_def_rewrite_uses(&intrin->def, rq_globals);
return true;
}
@@ -1343,25 +1864,22 @@ apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state)
return lower_get_ssbo_size(b, intrin, state);
case nir_intrinsic_image_deref_load:
case nir_intrinsic_image_deref_store:
- case nir_intrinsic_image_deref_atomic_add:
- case nir_intrinsic_image_deref_atomic_imin:
- case nir_intrinsic_image_deref_atomic_umin:
- case nir_intrinsic_image_deref_atomic_imax:
- case nir_intrinsic_image_deref_atomic_umax:
- case nir_intrinsic_image_deref_atomic_and:
- case nir_intrinsic_image_deref_atomic_or:
- case nir_intrinsic_image_deref_atomic_xor:
- case nir_intrinsic_image_deref_atomic_exchange:
- case nir_intrinsic_image_deref_atomic_comp_swap:
- case nir_intrinsic_image_deref_atomic_fadd:
- case nir_intrinsic_image_deref_size:
+ case nir_intrinsic_image_deref_atomic:
+ case nir_intrinsic_image_deref_atomic_swap:
case nir_intrinsic_image_deref_samples:
case nir_intrinsic_image_deref_load_param_intel:
case nir_intrinsic_image_deref_load_raw_intel:
case nir_intrinsic_image_deref_store_raw_intel:
+ case nir_intrinsic_image_deref_sparse_load:
return lower_image_intrinsic(b, intrin, state);
+ case nir_intrinsic_image_deref_size:
+ return lower_image_size_intrinsic(b, intrin, state);
case nir_intrinsic_load_constant:
return lower_load_constant(b, intrin, state);
+ case nir_intrinsic_load_base_workgroup_id:
+ return lower_base_workgroup_id(b, intrin, state);
+ case nir_intrinsic_load_ray_query_global_intel:
+ return lower_ray_query_globals(b, intrin, state);
default:
return false;
}
@@ -1393,66 +1911,300 @@ compare_binding_infos(const void *_a, const void *_b)
return a->binding - b->binding;
}
+#ifndef NDEBUG
+static void
+anv_validate_pipeline_layout(const struct anv_pipeline_sets_layout *layout,
+ nir_shader *shader)
+{
+ nir_foreach_function_impl(impl, shader) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_vulkan_resource_index)
+ continue;
+
+ unsigned set = nir_intrinsic_desc_set(intrin);
+ assert(layout->set[set].layout);
+ }
+ }
+ }
+}
+#endif
+
+static bool
+binding_is_promotable_to_push(const struct anv_descriptor_set_layout *set_layout,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ if (set_layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)
+ return true;
+
+ if (set_layout->flags & (VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT |
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT))
+ return false;
+
+ return (bind_layout->flags & non_pushable_binding_flags) == 0;
+}
+
+static void
+add_null_bti_entry(struct anv_pipeline_bind_map *map)
+{
+ map->surface_to_descriptor[map->surface_count++] =
+ (struct anv_pipeline_binding) {
+ .set = ANV_DESCRIPTOR_SET_NULL,
+ };
+ assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
+}
+
+static void
+add_bti_entry(struct anv_pipeline_bind_map *map,
+ uint32_t set,
+ uint32_t binding,
+ uint32_t element,
+ uint32_t plane,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ map->surface_to_descriptor[map->surface_count++] =
+ (struct anv_pipeline_binding) {
+ .set = set,
+ .binding = binding,
+ .index = bind_layout->descriptor_index + element,
+ .set_offset = bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride +
+ plane * bind_layout->descriptor_data_surface_size,
+ .plane = plane,
+ };
+ assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
+}
+
+static void
+add_dynamic_bti_entry(struct anv_pipeline_bind_map *map,
+ uint32_t set,
+ uint32_t binding,
+ uint32_t element,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ map->surface_to_descriptor[map->surface_count++] =
+ (struct anv_pipeline_binding) {
+ .set = set,
+ .binding = binding,
+ .index = bind_layout->descriptor_index + element,
+ .set_offset = bind_layout->descriptor_surface_offset +
+ element * bind_layout->descriptor_surface_stride,
+ .dynamic_offset_index = bind_layout->dynamic_offset_index + element,
+ };
+ assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
+}
+
+static void
+add_sampler_entry(struct anv_pipeline_bind_map *map,
+ uint32_t set,
+ uint32_t binding,
+ uint32_t element,
+ uint32_t plane,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ assert((bind_layout->descriptor_index + element) < layout->set[set].layout->descriptor_count);
+ map->sampler_to_descriptor[map->sampler_count++] =
+ (struct anv_pipeline_binding) {
+ .set = set,
+ .binding = binding,
+ .index = bind_layout->descriptor_index + element,
+ .plane = plane,
+ };
+}
+
+static void
+add_push_entry(struct anv_pipeline_push_map *push_map,
+ uint32_t set,
+ uint32_t binding,
+ uint32_t element,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_descriptor_set_binding_layout *bind_layout)
+{
+ push_map->block_to_descriptor[push_map->block_count++] =
+ (struct anv_pipeline_binding) {
+ .set = set,
+ .binding = binding,
+ .index = bind_layout->descriptor_index + element,
+ .dynamic_offset_index = bind_layout->dynamic_offset_index + element,
+ };
+}
+
+static void
+add_embedded_sampler_entry(struct apply_pipeline_layout_state *state,
+ struct anv_pipeline_bind_map *map,
+ uint32_t set, uint32_t binding)
+{
+ state->set[set].binding[binding].embedded_sampler_index =
+ map->embedded_sampler_count;
+ struct anv_pipeline_embedded_sampler_binding *sampler =
+ &map->embedded_sampler_to_binding[map->embedded_sampler_count++];
+ const struct anv_descriptor_set_layout *set_layout =
+ state->layout->set[set].layout;
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[binding];
+
+ *sampler = (struct anv_pipeline_embedded_sampler_binding) {
+ .set = set,
+ .binding = binding,
+ };
+
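+ /* The embedded sampler key captures the packed SAMPLER_STATE (state_no_bc)
+ * together with the border color values.
+ */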
+ assert(sizeof(sampler->key.sampler) ==
+ sizeof(bind_layout->immutable_samplers[0]->state_no_bc[0]));
+ memcpy(sampler->key.sampler,
+ bind_layout->immutable_samplers[0]->state_no_bc[0],
+ sizeof(sampler->key.sampler));
+
+ assert(sizeof(sampler->key.color) ==
+ sizeof(bind_layout->immutable_samplers[0]->vk.border_color_value.uint32));
+ memcpy(sampler->key.color,
+ bind_layout->immutable_samplers[0]->vk.border_color_value.uint32,
+ sizeof(sampler->key.color));
+}
+
+static bool
+binding_should_use_surface_binding_table(const struct apply_pipeline_layout_state *state,
+ const struct anv_descriptor_set_binding_layout *binding)
+{
+ if ((binding->data & ANV_DESCRIPTOR_BTI_SURFACE_STATE) == 0)
+ return false;
+
+ if (state->pdevice->always_use_bindless &&
+ (binding->data & ANV_DESCRIPTOR_SURFACE))
+ return false;
+
+ return true;
+}
+
+static bool
+binding_should_use_sampler_binding_table(const struct apply_pipeline_layout_state *state,
+ const struct anv_descriptor_set_binding_layout *binding)
+{
+ if ((binding->data & ANV_DESCRIPTOR_BTI_SAMPLER_STATE) == 0)
+ return false;
+
+ if (state->pdevice->always_use_bindless &&
+ (binding->data & ANV_DESCRIPTOR_SAMPLER))
+ return false;
+
+ return true;
+}
+
void
-anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
- bool robust_buffer_access,
- const struct anv_pipeline_layout *layout,
- nir_shader *shader,
- struct anv_pipeline_bind_map *map)
+anv_nir_apply_pipeline_layout(nir_shader *shader,
+ const struct anv_physical_device *pdevice,
+ enum brw_robustness_flags robust_flags,
+ bool independent_sets,
+ const struct anv_pipeline_sets_layout *layout,
+ struct anv_pipeline_bind_map *map,
+ struct anv_pipeline_push_map *push_map,
+ void *push_map_mem_ctx)
{
void *mem_ctx = ralloc_context(NULL);
+#ifndef NDEBUG
+ /* We should not have any reference to a descriptor set that is not
+ * provided through the pipeline layout (layout->set[set].layout = NULL).
+ */
+ anv_validate_pipeline_layout(layout, shader);
+#endif
+
+ const bool bindless_stage =
+ brw_shader_stage_requires_bindless_resources(shader->info.stage);
struct apply_pipeline_layout_state state = {
.pdevice = pdevice,
.layout = layout,
- .add_bounds_checks = robust_buffer_access,
- .desc_addr_format = brw_shader_stage_is_bindless(shader->info.stage) ?
+ .desc_addr_format = bindless_stage ?
nir_address_format_64bit_global_32bit_offset :
nir_address_format_32bit_index_offset,
- .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_buffer_access),
- .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_buffer_access),
+ .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags),
+ .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags),
.lowered_instrs = _mesa_pointer_set_create(mem_ctx),
+ .has_independent_sets = independent_sets,
};
+ /* Compute the amount of push block items required. */
+ unsigned push_block_count = 0;
for (unsigned s = 0; s < layout->num_sets; s++) {
+ if (!layout->set[s].layout)
+ continue;
+
const unsigned count = layout->set[s].layout->binding_count;
- state.set[s].use_count = rzalloc_array(mem_ctx, uint8_t, count);
- state.set[s].surface_offsets = rzalloc_array(mem_ctx, uint8_t, count);
- state.set[s].sampler_offsets = rzalloc_array(mem_ctx, uint8_t, count);
+ state.set[s].binding = rzalloc_array_size(mem_ctx, sizeof(state.set[s].binding[0]), count);
+
+ const struct anv_descriptor_set_layout *set_layout = layout->set[s].layout;
+ for (unsigned b = 0; b < set_layout->binding_count; b++) {
+ if (set_layout->binding[b].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
+ push_block_count += set_layout->binding[b].array_size;
+ }
}
+ /* Find all used sets/bindings */
nir_shader_instructions_pass(shader, get_used_bindings,
nir_metadata_all, &state);
+ /* Assign a BTI to each used descriptor set */
for (unsigned s = 0; s < layout->num_sets; s++) {
if (state.desc_addr_format != nir_address_format_32bit_index_offset) {
state.set[s].desc_offset = BINDLESS_OFFSET;
} else if (state.set[s].desc_buffer_used) {
map->surface_to_descriptor[map->surface_count] =
(struct anv_pipeline_binding) {
- .set = ANV_DESCRIPTOR_SET_DESCRIPTORS,
+ .set = (layout->type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER) ?
+ ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER :
+ ANV_DESCRIPTOR_SET_DESCRIPTORS,
+ .binding = UINT32_MAX,
.index = s,
};
- state.set[s].desc_offset = map->surface_count;
- map->surface_count++;
+ state.set[s].desc_offset = map->surface_count++;
}
}
- if (state.uses_constants && !pdevice->use_softpin) {
- state.constants_offset = map->surface_count;
- map->surface_to_descriptor[map->surface_count].set =
- ANV_DESCRIPTOR_SET_SHADER_CONSTANTS;
- map->surface_count++;
- }
+ /* Assign a block index for each surface */
+ push_map->block_to_descriptor =
+ rzalloc_array(push_map_mem_ctx, struct anv_pipeline_binding,
+ map->surface_count + push_block_count);
+
+ memcpy(push_map->block_to_descriptor,
+ map->surface_to_descriptor,
+ sizeof(push_map->block_to_descriptor[0]) * map->surface_count);
+ push_map->block_count = map->surface_count;
+ /* Count used bindings, assign embedded sampler indices & add push blocks
+ * for promotion to push constants
+ */
unsigned used_binding_count = 0;
for (uint32_t set = 0; set < layout->num_sets; set++) {
struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
+ if (!set_layout)
+ continue;
+
for (unsigned b = 0; b < set_layout->binding_count; b++) {
- if (state.set[set].use_count[b] == 0)
+ if (state.set[set].binding[b].use_count == 0)
continue;
used_binding_count++;
+
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &set_layout->binding[b];
+
+ if (state.set[set].binding[b].properties & BINDING_PROPERTY_EMBEDDED_SAMPLER)
+ add_embedded_sampler_entry(&state, map, set, b);
+
+ if (binding_is_promotable_to_push(set_layout, bind_layout)) {
+ if (bind_layout->type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
+ state.set[set].binding[b].push_block = push_map->block_count;
+ for (unsigned i = 0; i < bind_layout->array_size; i++)
+ add_push_entry(push_map, set, b, i, layout, bind_layout);
+ } else {
+ state.set[set].binding[b].push_block = state.set[set].desc_offset;
+ }
+ }
}
}
@@ -1461,8 +2213,11 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
used_binding_count = 0;
for (uint32_t set = 0; set < layout->num_sets; set++) {
const struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
+ if (!set_layout)
+ continue;
+
for (unsigned b = 0; b < set_layout->binding_count; b++) {
- if (state.set[set].use_count[b] == 0)
+ if (state.set[set].binding[b].use_count == 0)
continue;
const struct anv_descriptor_set_binding_layout *binding =
@@ -1474,14 +2229,13 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
* everything which does not support bindless super higher priority
* than things which do.
*/
- uint16_t score = ((uint16_t)state.set[set].use_count[b] << 7) /
+ uint16_t score = ((uint16_t)state.set[set].binding[b].use_count << 7) /
binding->array_size;
/* If the descriptor type doesn't support bindless then put it at the
* beginning so we guarantee it gets a slot.
*/
- if (!anv_descriptor_supports_bindless(pdevice, binding, true) ||
- !anv_descriptor_supports_bindless(pdevice, binding, false))
+ if (!anv_descriptor_supports_bindless(pdevice, set_layout, binding))
score |= 1 << 15;
infos[used_binding_count++] = (struct binding_info) {
@@ -1500,58 +2254,59 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
for (unsigned i = 0; i < used_binding_count; i++) {
unsigned set = infos[i].set, b = infos[i].binding;
+ assert(layout->set[set].layout);
+ const struct anv_descriptor_set_layout *set_layout =
+ layout->set[set].layout;
const struct anv_descriptor_set_binding_layout *binding =
- &layout->set[set].layout->binding[b];
+ &set_layout->binding[b];
const uint32_t array_size = binding->array_size;
if (binding->dynamic_offset_index >= 0)
state.has_dynamic_buffers = true;
- if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) {
- if (map->surface_count + array_size > MAX_BINDING_TABLE_SIZE ||
- anv_descriptor_requires_bindless(pdevice, binding, false) ||
- brw_shader_stage_is_bindless(shader->info.stage)) {
+ const unsigned array_multiplier = bti_multiplier(&state, set, b);
+ assert(array_multiplier >= 1);
+
+ /* Assume bindless by default */
+ state.set[set].binding[b].surface_offset = BINDLESS_OFFSET;
+ state.set[set].binding[b].sampler_offset = BINDLESS_OFFSET;
+
+ if (binding_should_use_surface_binding_table(&state, binding)) {
+ if (map->surface_count + array_size * array_multiplier > MAX_BINDING_TABLE_SIZE ||
+ anv_descriptor_requires_bindless(pdevice, set_layout, binding) ||
+ brw_shader_stage_requires_bindless_resources(shader->info.stage)) {
/* If this descriptor doesn't fit in the binding table or if it
* requires bindless for some reason, flag it as bindless.
*/
- assert(anv_descriptor_supports_bindless(pdevice, binding, false));
- state.set[set].surface_offsets[b] = BINDLESS_OFFSET;
+ assert(anv_descriptor_supports_bindless(pdevice, set_layout, binding));
} else {
- state.set[set].surface_offsets[b] = map->surface_count;
+ state.set[set].binding[b].surface_offset = map->surface_count;
if (binding->dynamic_offset_index < 0) {
struct anv_sampler **samplers = binding->immutable_samplers;
+ uint8_t max_planes = bti_multiplier(&state, set, b);
for (unsigned i = 0; i < binding->array_size; i++) {
uint8_t planes = samplers ? samplers[i]->n_planes : 1;
- for (uint8_t p = 0; p < planes; p++) {
- map->surface_to_descriptor[map->surface_count++] =
- (struct anv_pipeline_binding) {
- .set = set,
- .index = binding->descriptor_index + i,
- .plane = p,
- };
+ for (uint8_t p = 0; p < max_planes; p++) {
+ if (p < planes) {
+ add_bti_entry(map, set, b, i, p, binding);
+ } else {
+ add_null_bti_entry(map);
+ }
}
}
} else {
- for (unsigned i = 0; i < binding->array_size; i++) {
- map->surface_to_descriptor[map->surface_count++] =
- (struct anv_pipeline_binding) {
- .set = set,
- .index = binding->descriptor_index + i,
- .dynamic_offset_index =
- layout->set[set].dynamic_offset_start +
- binding->dynamic_offset_index + i,
- };
- }
+ for (unsigned i = 0; i < binding->array_size; i++)
+ add_dynamic_bti_entry(map, set, b, i, layout, binding);
}
}
assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
}
- if (binding->data & ANV_DESCRIPTOR_SAMPLER_STATE) {
- if (map->sampler_count + array_size > MAX_SAMPLER_TABLE_SIZE ||
- anv_descriptor_requires_bindless(pdevice, binding, true) ||
- brw_shader_stage_is_bindless(shader->info.stage)) {
+ if (binding_should_use_sampler_binding_table(&state, binding)) {
+ if (map->sampler_count + array_size * array_multiplier > MAX_SAMPLER_TABLE_SIZE ||
+ anv_descriptor_requires_bindless(pdevice, set_layout, binding) ||
+ brw_shader_stage_requires_bindless_resources(shader->info.stage)) {
/* If this descriptor doesn't fit in the binding table or if it
* requires bindless for some reason, flag it as bindless.
*
@@ -1559,60 +2314,29 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
* using indirect sends thanks to bindless samplers being packed
* less tightly than the sampler table.
*/
- assert(anv_descriptor_supports_bindless(pdevice, binding, true));
- state.set[set].sampler_offsets[b] = BINDLESS_OFFSET;
+ assert(anv_descriptor_supports_bindless(pdevice, set_layout, binding));
} else {
- state.set[set].sampler_offsets[b] = map->sampler_count;
- struct anv_sampler **samplers = binding->immutable_samplers;
+ state.set[set].binding[b].sampler_offset = map->sampler_count;
+ uint8_t max_planes = bti_multiplier(&state, set, b);
for (unsigned i = 0; i < binding->array_size; i++) {
- uint8_t planes = samplers ? samplers[i]->n_planes : 1;
- for (uint8_t p = 0; p < planes; p++) {
- map->sampler_to_descriptor[map->sampler_count++] =
- (struct anv_pipeline_binding) {
- .set = set,
- .index = binding->descriptor_index + i,
- .plane = p,
- };
+ for (uint8_t p = 0; p < max_planes; p++) {
+ add_sampler_entry(map, set, b, i, p, layout, binding);
}
}
}
}
- }
- nir_foreach_uniform_variable(var, shader) {
- const struct glsl_type *glsl_type = glsl_without_array(var->type);
-
- if (!glsl_type_is_image(glsl_type))
- continue;
-
- enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type);
-
- const uint32_t set = var->data.descriptor_set;
- const uint32_t binding = var->data.binding;
- const struct anv_descriptor_set_binding_layout *bind_layout =
- &layout->set[set].layout->binding[binding];
- const uint32_t array_size = bind_layout->array_size;
-
- if (state.set[set].use_count[binding] == 0)
- continue;
-
- if (state.set[set].surface_offsets[binding] >= MAX_BINDING_TABLE_SIZE)
- continue;
-
- struct anv_pipeline_binding *pipe_binding =
- &map->surface_to_descriptor[state.set[set].surface_offsets[binding]];
- for (unsigned i = 0; i < array_size; i++) {
- assert(pipe_binding[i].set == set);
- assert(pipe_binding[i].index == bind_layout->descriptor_index + i);
-
- if (dim == GLSL_SAMPLER_DIM_SUBPASS ||
- dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
- pipe_binding[i].input_attachment_index = var->data.index + i;
-
- /* NOTE: This is a uint8_t so we really do need to != 0 here */
- pipe_binding[i].write_only =
- (var->data.access & ACCESS_NON_READABLE) != 0;
+ if (binding->data & ANV_DESCRIPTOR_INLINE_UNIFORM) {
+ state.set[set].binding[b].surface_offset = state.set[set].desc_offset;
}
+
+#if 0
+ fprintf(stderr, "set=%u binding=%u surface_offset=0x%08x require_bindless=%u type=%s\n",
+ set, b,
+ state.set[set].binding[b].surface_offset,
+ anv_descriptor_requires_bindless(pdevice, set_layout, binding),
+ vk_DescriptorType_to_str(binding->type));
+#endif
}
/* Before we do the normal lowering, we look for any SSBO operations
@@ -1667,6 +2391,27 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
assert(map->sampler_count == 0);
}
+#if 0
+ fprintf(stderr, "bti:\n");
+ for (unsigned i = 0; i < map->surface_count; i++) {
+ fprintf(stderr, " %03i: set=%03u binding=%06i index=%u plane=%u set_offset=0x%08x dyn_offset=0x%08x\n", i,
+ map->surface_to_descriptor[i].set,
+ map->surface_to_descriptor[i].binding,
+ map->surface_to_descriptor[i].index,
+ map->surface_to_descriptor[i].plane,
+ map->surface_to_descriptor[i].set_offset,
+ map->surface_to_descriptor[i].dynamic_offset_index);
+ }
+ fprintf(stderr, "sti:\n");
+ for (unsigned i = 0; i < map->sampler_count; i++) {
+ fprintf(stderr, " %03i: set=%03u binding=%06i index=%u plane=%u\n", i,
+ map->sampler_to_descriptor[i].set,
+ map->sampler_to_descriptor[i].binding,
+ map->sampler_to_descriptor[i].index,
+ map->sampler_to_descriptor[i].plane);
+ }
+#endif
+
/* Now that we're done computing the surface and sampler portions of the
* bind map, hash them. This lets us quickly determine if the actual
* mapping has changed and not just a no-op pipeline change.
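/* Editorial sketch, not part of the patch: how the new per-binding
 * multiplier lays out binding table entries.  Each array element now
 * occupies max_planes slots, with null entries padding the unused
 * planes, so a shader can index "element * max_planes + plane".
 * array_size, max_planes and the plane counts below are made up.
 */
#include <stdio.h>

int main(void)
{
   const unsigned array_size = 3;
   const unsigned max_planes = 2;              /* bti_multiplier() result */
   const unsigned planes[3]  = { 1, 2, 1 };    /* per-sampler plane count */

   unsigned surface_count = 0;
   for (unsigned i = 0; i < array_size; i++) {
      for (unsigned p = 0; p < max_planes; p++) {
         if (p < planes[i])
            printf("bti %2u -> element %u plane %u\n", surface_count, i, p);
         else
            printf("bti %2u -> null entry (padding)\n", surface_count);
         surface_count++;
      }
   }
   return 0;
}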
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
index 526e1a48f0b..74e59e4cb28 100644
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@@ -29,11 +29,14 @@
#define sizeof_field(type, field) sizeof(((type *)0)->field)
void
-anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
- bool robust_buffer_access,
- nir_shader *nir,
+anv_nir_compute_push_layout(nir_shader *nir,
+ const struct anv_physical_device *pdevice,
+ enum brw_robustness_flags robust_flags,
+ bool fragment_dynamic,
struct brw_stage_prog_data *prog_data,
struct anv_pipeline_bind_map *map,
+ const struct anv_pipeline_push_map *push_map,
+ enum anv_descriptor_set_layout_type desc_type,
void *mem_ctx)
{
const struct brw_compiler *compiler = pdevice->compiler;
@@ -42,11 +45,8 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
bool has_const_ubo = false;
unsigned push_start = UINT_MAX, push_end = 0;
- nir_foreach_function(function, nir) {
- if (!function->impl)
- continue;
-
- nir_foreach_block(block, function->impl) {
+ nir_foreach_function_impl(impl, nir) {
+ nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
@@ -54,7 +54,7 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_ubo:
- if (nir_src_is_const(intrin->src[0]) &&
+ if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
nir_src_is_const(intrin->src[1]))
has_const_ubo = true;
break;
@@ -68,11 +68,25 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
}
case nir_intrinsic_load_desc_set_address_intel:
- push_start = MIN2(push_start,
- offsetof(struct anv_push_constants, desc_sets));
- push_end = MAX2(push_end, push_start +
- sizeof_field(struct anv_push_constants, desc_sets));
+ case nir_intrinsic_load_desc_set_dynamic_index_intel: {
+ unsigned base = offsetof(struct anv_push_constants,
+ desc_surface_offsets);
+ push_start = MIN2(push_start, base);
+ push_end = MAX2(push_end, base +
+ sizeof_field(struct anv_push_constants,
+ desc_surface_offsets));
+
+ if (desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER &&
+ !pdevice->uses_ex_bso) {
+ base = offsetof(struct anv_push_constants,
+ surfaces_base_offset);
+ push_start = MIN2(push_start, base);
+ push_end = MAX2(push_end, base +
+ sizeof_field(struct anv_push_constants,
+ surfaces_base_offset));
+ }
break;
+ }
default:
break;
@@ -84,11 +98,10 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
const bool has_push_intrinsic = push_start <= push_end;
const bool push_ubo_ranges =
- pdevice->info.verx10 >= 75 &&
has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE &&
- !brw_shader_stage_is_bindless(nir->info.stage);
+ !brw_shader_stage_requires_bindless_resources(nir->info.stage);
- if (push_ubo_ranges && robust_buffer_access) {
+ if (push_ubo_ranges && (robust_flags & BRW_ROBUSTNESS_UBO)) {
/* We can't on-the-fly adjust our push ranges because doing so would
* mess up the layout in the shader. When robustBufferAccess is
* enabled, we push a mask into the shader indicating which pushed
@@ -102,6 +115,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
push_end = MAX2(push_end, push_reg_mask_end);
}
+ if (nir->info.stage == MESA_SHADER_FRAGMENT && fragment_dynamic) {
+ const uint32_t fs_msaa_flags_start =
+ offsetof(struct anv_push_constants, gfx.fs_msaa_flags);
+ const uint32_t fs_msaa_flags_end = fs_msaa_flags_start + sizeof(uint32_t);
+ push_start = MIN2(push_start, fs_msaa_flags_start);
+ push_end = MAX2(push_end, fs_msaa_flags_end);
+ }
+
if (nir->info.stage == MESA_SHADER_COMPUTE && devinfo->verx10 < 125) {
/* For compute shaders, we always have to have the subgroup ID. The
* back-end compiler will "helpfully" add it for us in the last push
@@ -118,13 +139,11 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
* push_end (no push constants is indicated by push_start = UINT_MAX).
*/
push_start = MIN2(push_start, push_end);
- push_start = align_down_u32(push_start, 32);
+ push_start = ROUND_DOWN_TO(push_start, 32);
- /* For vec4 our push data size needs to be aligned to a vec4 and for
- * scalar, it needs to be aligned to a DWORD.
- */
- const unsigned align = compiler->scalar_stage[nir->info.stage] ? 4 : 16;
- nir->num_uniforms = ALIGN(push_end - push_start, align);
+ /* For scalar, push data size needs to be aligned to a DWORD. */
+ const unsigned alignment = 4;
+ nir->num_uniforms = ALIGN(push_end - push_start, alignment);
prog_data->nr_params = nir->num_uniforms / 4;
prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params);
@@ -135,35 +154,80 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
};
if (has_push_intrinsic) {
- nir_foreach_function(function, nir) {
- if (!function->impl)
- continue;
+ nir_foreach_function_impl(impl, nir) {
+ nir_builder build = nir_builder_create(impl);
+ nir_builder *b = &build;
- nir_builder build, *b = &build;
- nir_builder_init(b, function->impl);
-
- nir_foreach_block(block, function->impl) {
+ nir_foreach_block(block, impl) {
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
- case nir_intrinsic_load_push_constant:
+ case nir_intrinsic_load_push_constant: {
+ /* With bindless shaders we load uniforms with SEND
+ * messages. All the push constants are located after the
+ * RT_DISPATCH_GLOBALS. We just need to add the offset to
+ * the address right after RT_DISPATCH_GLOBALS (see
+ * brw_nir_lower_rt_intrinsics.c).
+ */
+ unsigned base_offset =
+ brw_shader_stage_requires_bindless_resources(nir->info.stage) ? 0 : push_start;
intrin->intrinsic = nir_intrinsic_load_uniform;
nir_intrinsic_set_base(intrin,
nir_intrinsic_base(intrin) -
- push_start);
+ base_offset);
break;
+ }
case nir_intrinsic_load_desc_set_address_intel: {
+ assert(brw_shader_stage_requires_bindless_resources(nir->info.stage));
b->cursor = nir_before_instr(&intrin->instr);
- nir_ssa_def *pc_load = nir_load_uniform(b, 1, 64,
- nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint64_t)),
- .base = offsetof(struct anv_push_constants, desc_sets),
- .range = sizeof_field(struct anv_push_constants, desc_sets),
- .dest_type = nir_type_uint64);
- nir_ssa_def_rewrite_uses(&intrin->dest.ssa, pc_load);
+ nir_def *desc_offset = nir_load_uniform(b, 1, 32,
+ nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint32_t)),
+ .base = offsetof(struct anv_push_constants,
+ desc_surface_offsets),
+ .range = sizeof_field(struct anv_push_constants,
+ desc_surface_offsets),
+ .dest_type = nir_type_uint32);
+ desc_offset = nir_iand_imm(b, desc_offset, ANV_DESCRIPTOR_SET_OFFSET_MASK);
+ if (desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER &&
+ !pdevice->uses_ex_bso) {
+ nir_def *bindless_base_offset = nir_load_uniform(
+ b, 1, 32,
+ nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants,
+ surfaces_base_offset),
+ .range = sizeof_field(struct anv_push_constants,
+ surfaces_base_offset),
+ .dest_type = nir_type_uint32);
+ desc_offset = nir_iadd(b, bindless_base_offset, desc_offset);
+ }
+ nir_def *desc_addr =
+ nir_pack_64_2x32_split(
+ b, desc_offset,
+ nir_load_reloc_const_intel(
+ b,
+ desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER ?
+ BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH :
+ BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH));
+ nir_def_rewrite_uses(&intrin->def, desc_addr);
+ break;
+ }
+
+ case nir_intrinsic_load_desc_set_dynamic_index_intel: {
+ b->cursor = nir_before_instr(&intrin->instr);
+ nir_def *pc_load = nir_load_uniform(b, 1, 32,
+ nir_imul_imm(b, intrin->src[0].ssa, sizeof(uint32_t)),
+ .base = offsetof(struct anv_push_constants,
+ desc_surface_offsets),
+ .range = sizeof_field(struct anv_push_constants,
+ desc_surface_offsets),
+ .dest_type = nir_type_uint32);
+ pc_load = nir_iand_imm(
+ b, pc_load, ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK);
+ nir_def_rewrite_uses(&intrin->def, pc_load);
break;
}
@@ -176,15 +240,9 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
}
if (push_ubo_ranges) {
- brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+ brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);
- /* The vec4 back-end pushes at most 32 regs while the scalar back-end
- * pushes up to 64. This is primarily because the scalar back-end has a
- * massively more competent register allocator and so the risk of
- * spilling due to UBO pushing isn't nearly as high.
- */
- const unsigned max_push_regs =
- compiler->scalar_stage[nir->info.stage] ? 64 : 32;
+ const unsigned max_push_regs = 64;
unsigned total_push_regs = push_constant_range.length;
for (unsigned i = 0; i < 4; i++) {
@@ -199,7 +257,7 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
if (push_constant_range.length > 0)
map->push_ranges[n++] = push_constant_range;
- if (robust_buffer_access) {
+ if (robust_flags & BRW_ROBUSTNESS_UBO) {
const uint32_t push_reg_mask_offset =
offsetof(struct anv_push_constants, push_reg_mask[nir->info.stage]);
assert(push_reg_mask_offset >= push_start);
@@ -214,13 +272,14 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
if (ubo_range->length == 0)
continue;
- if (n >= 4 || (n == 3 && compiler->constant_buffer_0_is_relative)) {
+ if (n >= 4) {
memset(ubo_range, 0, sizeof(*ubo_range));
continue;
}
+ assert(ubo_range->block < push_map->block_count);
const struct anv_pipeline_binding *binding =
- &map->surface_to_descriptor[ubo_range->block];
+ &push_map->block_to_descriptor[ubo_range->block];
map->push_ranges[n++] = (struct anv_push_range) {
.set = binding->set,
@@ -231,7 +290,8 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
};
/* We only bother to shader-zero pushed client UBOs */
- if (binding->set < MAX_SETS && robust_buffer_access) {
+ if (binding->set < MAX_SETS &&
+ (robust_flags & BRW_ROBUSTNESS_UBO)) {
prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
ubo_range->length);
}
@@ -250,6 +310,27 @@ anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
map->push_ranges[0] = push_constant_range;
}
+ if (nir->info.stage == MESA_SHADER_FRAGMENT && fragment_dynamic) {
+ struct brw_wm_prog_data *wm_prog_data =
+ container_of(prog_data, struct brw_wm_prog_data, base);
+
+ const uint32_t fs_msaa_flags_offset =
+ offsetof(struct anv_push_constants, gfx.fs_msaa_flags);
+ assert(fs_msaa_flags_offset >= push_start);
+ wm_prog_data->msaa_flags_param =
+ (fs_msaa_flags_offset - push_start) / 4;
+ }
+
+#if 0
+ fprintf(stderr, "stage=%s push ranges:\n", gl_shader_stage_name(nir->info.stage));
+ for (unsigned i = 0; i < ARRAY_SIZE(map->push_ranges); i++)
+ fprintf(stderr, " range%i: %03u-%03u set=%u index=%u\n", i,
+ map->push_ranges[i].start,
+ map->push_ranges[i].length,
+ map->push_ranges[i].set,
+ map->push_ranges[i].index);
+#endif
+
/* Now that we're done computing the push constant portion of the
* bind map, hash it. This lets us quickly determine if the actual
* mapping has changed and not just a no-op pipeline change.
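/* Editorial sketch, not part of the patch: the push constant window
 * arithmetic above, with made-up byte offsets.  The start is rounded
 * down to 32 bytes, the size is DWORD-aligned for the scalar backend,
 * and nr_params counts DWORDs.
 */
#include <stdio.h>

#define ROUND_DOWN_TO(x, a) ((x) & ~((a) - 1u))
#define ALIGN(x, a)         (((x) + (a) - 1u) & ~((a) - 1u))

int main(void)
{
   unsigned push_start = 44, push_end = 131;   /* byte offsets touched */

   push_start = ROUND_DOWN_TO(push_start, 32);                /* 32  */
   unsigned num_uniforms = ALIGN(push_end - push_start, 4);   /* 100 */
   unsigned nr_params = num_uniforms / 4;                     /* 25  */

   printf("start=%u size=%u dwords=%u\n", push_start, num_uniforms, nr_params);
   return 0;
}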
diff --git a/src/intel/vulkan/anv_nir_lower_load_patch_vertices_in.c b/src/intel/vulkan/anv_nir_lower_load_patch_vertices_in.c
new file mode 100644
index 00000000000..a9e0fde6f2e
--- /dev/null
+++ b/src/intel/vulkan/anv_nir_lower_load_patch_vertices_in.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * This file implements the lowering required for
+ * VK_EXT_extended_dynamic_state2 extendedDynamicState2PatchControlPoints.
+ *
+ * When VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT is set on a pipeline, we
+ * need to compile the TCS shader assuming the max (32) number of control
+ * points. The actual value is provided through push constants.
+ */
+
+#include "anv_nir.h"
+#include "nir_builder.h"
+
+#define sizeof_field(type, field) sizeof(((type *)0)->field)
+
+static bool
+lower_patch_vertices_in_instr(nir_builder *b, nir_intrinsic_instr *load,
+ UNUSED void *_data)
+{
+ if (load->intrinsic != nir_intrinsic_load_patch_vertices_in)
+ return false;
+
+ b->cursor = nir_before_instr(&load->instr);
+
+ nir_def_rewrite_uses(
+ &load->def,
+ nir_load_push_constant(
+ b, 1, 32,
+ nir_imm_int(b, 0),
+ .base = offsetof(struct anv_push_constants, gfx.tcs_input_vertices),
+ .range = sizeof_field(struct anv_push_constants, gfx.tcs_input_vertices)));
+ nir_instr_remove(&load->instr);
+
+ return true;
+}
+
+bool
+anv_nir_lower_load_patch_vertices_in(nir_shader *shader)
+{
+ return nir_shader_intrinsics_pass(shader, lower_patch_vertices_in_instr,
+ nir_metadata_block_index |
+ nir_metadata_dominance,
+ NULL);
+}
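/* Editorial sketch, not part of the patch: the situation this pass
 * handles.  With dynamic patch control points the TCS is compiled for
 * the maximum of 32, and gl_PatchVerticesIn is lowered to a push
 * constant load of gfx.tcs_input_vertices.  The values below are
 * illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
   bool dynamic_control_points = true;   /* VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT */
   unsigned static_count = 4;            /* patchControlPoints from the pipeline */

   unsigned compile_count = dynamic_control_points ? 32 : static_count;
   printf("compile TCS for %u control points; gl_PatchVerticesIn %s\n",
          compile_count,
          dynamic_control_points ? "comes from push constants at draw time"
                                 : "stays a compile-time constant");
   return 0;
}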
diff --git a/src/intel/vulkan/anv_nir_lower_multiview.c b/src/intel/vulkan/anv_nir_lower_multiview.c
index 63d9f5a2e8e..b26dd6970db 100644
--- a/src/intel/vulkan/anv_nir_lower_multiview.c
+++ b/src/intel/vulkan/anv_nir_lower_multiview.c
@@ -23,7 +23,7 @@
#include "anv_nir.h"
#include "nir/nir_builder.h"
-#include "util/debug.h"
+#include "util/u_debug.h"
/**
* This file implements the lowering required for VK_KHR_multiview.
@@ -42,11 +42,12 @@ struct lower_multiview_state {
uint32_t view_mask;
- nir_ssa_def *instance_id;
- nir_ssa_def *view_index;
+ nir_def *instance_id_with_views;
+ nir_def *instance_id;
+ nir_def *view_index;
};
-static nir_ssa_def *
+static nir_def *
build_instance_id(struct lower_multiview_state *state)
{
assert(state->builder.shader->info.stage == MESA_SHADER_VERTEX);
@@ -54,27 +55,31 @@ build_instance_id(struct lower_multiview_state *state)
if (state->instance_id == NULL) {
nir_builder *b = &state->builder;
- b->cursor = nir_before_block(nir_start_block(b->impl));
+ b->cursor =
+ nir_after_instr(state->instance_id_with_views->parent_instr);
/* We use instancing for implementing multiview. The actual instance id
* is given by dividing instance_id by the number of views in this
* subpass.
*/
state->instance_id =
- nir_idiv(b, nir_load_instance_id(b),
+ nir_idiv(b, state->instance_id_with_views,
nir_imm_int(b, util_bitcount(state->view_mask)));
}
return state->instance_id;
}
-static nir_ssa_def *
+static nir_def *
build_view_index(struct lower_multiview_state *state)
{
+ assert(state->builder.shader->info.stage != MESA_SHADER_FRAGMENT);
+
if (state->view_index == NULL) {
nir_builder *b = &state->builder;
- b->cursor = nir_before_block(nir_start_block(b->impl));
+ b->cursor =
+ nir_after_instr(state->instance_id_with_views->parent_instr);
assert(state->view_mask != 0);
if (util_bitcount(state->view_mask) == 1) {
@@ -88,9 +93,9 @@ build_view_index(struct lower_multiview_state *state)
* id is given by instance_id % view_count. We then have to convert
* that to an actual view id.
*/
- nir_ssa_def *compacted =
- nir_umod(b, nir_load_instance_id(b),
- nir_imm_int(b, util_bitcount(state->view_mask)));
+ nir_def *compacted =
+ nir_umod_imm(b, state->instance_id_with_views,
+ util_bitcount(state->view_mask));
if (util_is_power_of_two_or_zero(state->view_mask + 1)) {
/* If we have a full view mask, then compacted is what we want */
@@ -107,24 +112,24 @@ build_view_index(struct lower_multiview_state *state)
remap |= (uint64_t)bit << (i++ * 4);
}
- nir_ssa_def *shift = nir_imul(b, compacted, nir_imm_int(b, 4));
+ nir_def *shift = nir_imul_imm(b, compacted, 4);
/* One of these days, when we have int64 everywhere, this will be
* easier.
*/
- nir_ssa_def *shifted;
+ nir_def *shifted;
if (remap <= UINT32_MAX) {
shifted = nir_ushr(b, nir_imm_int(b, remap), shift);
} else {
- nir_ssa_def *shifted_low =
+ nir_def *shifted_low =
nir_ushr(b, nir_imm_int(b, remap), shift);
- nir_ssa_def *shifted_high =
+ nir_def *shifted_high =
nir_ushr(b, nir_imm_int(b, remap >> 32),
- nir_isub(b, shift, nir_imm_int(b, 32)));
- shifted = nir_bcsel(b, nir_ilt(b, shift, nir_imm_int(b, 32)),
+ nir_iadd_imm(b, shift, -32));
+ shifted = nir_bcsel(b, nir_ilt_imm(b, shift, 32),
shifted_low, shifted_high);
}
- state->view_index = nir_iand(b, shifted, nir_imm_int(b, 0xf));
+ state->view_index = nir_iand_imm(b, shifted, 0xf);
}
} else {
const struct glsl_type *type = glsl_int_type();
@@ -157,7 +162,7 @@ is_load_view_index(const nir_instr *instr, const void *data)
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_view_index;
}
-static nir_ssa_def *
+static nir_def *
replace_load_view_index_with_zero(struct nir_builder *b,
nir_instr *instr, void *data)
{
@@ -165,12 +170,19 @@ replace_load_view_index_with_zero(struct nir_builder *b,
return nir_imm_zero(b, 1, 32);
}
+static nir_def *
+replace_load_view_index_with_layer_id(struct nir_builder *b,
+ nir_instr *instr, void *data)
+{
+ assert(is_load_view_index(instr, data));
+ return nir_load_layer_id(b);
+}
+
bool
-anv_nir_lower_multiview(nir_shader *shader,
- struct anv_graphics_pipeline *pipeline)
+anv_nir_lower_multiview(nir_shader *shader, uint32_t view_mask,
+ bool use_primitive_replication)
{
assert(shader->info.stage != MESA_SHADER_COMPUTE);
- uint32_t view_mask = pipeline->subpass->view_mask;
/* If multiview isn't enabled, just lower the ViewIndex builtin to zero. */
if (view_mask == 0) {
@@ -178,6 +190,11 @@ anv_nir_lower_multiview(nir_shader *shader,
replace_load_view_index_with_zero, NULL);
}
+ if (shader->info.stage == MESA_SHADER_FRAGMENT) {
+ return nir_shader_lower_instructions(shader, is_load_view_index,
+ replace_load_view_index_with_layer_id, NULL);
+ }
+
/* This pass assumes a single entrypoint */
nir_function_impl *entrypoint = nir_shader_get_entrypoint(shader);
@@ -186,16 +203,11 @@ anv_nir_lower_multiview(nir_shader *shader,
* view, then it is possible to use the feature instead of instancing to
* implement multiview.
*/
- if (pipeline->use_primitive_replication) {
- if (shader->info.stage == MESA_SHADER_FRAGMENT)
- return false;
-
- bool progress = nir_lower_multiview(shader, pipeline->subpass->view_mask);
+ if (use_primitive_replication) {
+ bool progress = nir_lower_multiview(shader, view_mask);
if (progress) {
- nir_builder b;
- nir_builder_init(&b, entrypoint);
- b.cursor = nir_before_cf_list(&entrypoint->body);
+ nir_builder b = nir_builder_at(nir_before_impl(entrypoint));
/* Fill Layer ID with zero. Replication will use that as base to
* apply the RTAI offsets.
@@ -214,81 +226,92 @@ anv_nir_lower_multiview(nir_shader *shader,
.view_mask = view_mask,
};
- nir_builder_init(&state.builder, entrypoint);
+ state.builder = nir_builder_at(nir_before_impl(entrypoint));
+ nir_builder *b = &state.builder;
- bool progress = false;
- nir_foreach_block(block, entrypoint) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_intrinsic)
- continue;
-
- nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
-
- if (load->intrinsic != nir_intrinsic_load_instance_id &&
- load->intrinsic != nir_intrinsic_load_view_index)
- continue;
-
- assert(load->dest.is_ssa);
-
- nir_ssa_def *value;
- if (load->intrinsic == nir_intrinsic_load_instance_id) {
- value = build_instance_id(&state);
- } else {
- assert(load->intrinsic == nir_intrinsic_load_view_index);
- value = build_view_index(&state);
- }
-
- nir_ssa_def_rewrite_uses(&load->dest.ssa, value);
-
- nir_instr_remove(&load->instr);
- progress = true;
- }
- }
+ /* Save the original "instance ID" which is the actual instance ID
+ * multiplied by the number of views.
+ */
+ state.instance_id_with_views = nir_load_instance_id(b);
/* The view index is available in all stages but the instance id is only
* available in the VS. If it's not a fragment shader, we need to pass
* the view index on to the next stage.
*/
- if (shader->info.stage != MESA_SHADER_FRAGMENT) {
- nir_ssa_def *view_index = build_view_index(&state);
+ nir_def *view_index = build_view_index(&state);
+
+ assert(view_index->parent_instr->block == nir_start_block(entrypoint));
+ b->cursor = nir_after_instr(view_index->parent_instr);
- nir_builder *b = &state.builder;
+ /* Unless there is only one possible view index (that would be set
+ * directly), pass it to the next stage.
+ */
+ nir_variable *view_index_out = NULL;
+ if (util_bitcount(state.view_mask) != 1) {
+ view_index_out = nir_variable_create(shader, nir_var_shader_out,
+ glsl_int_type(), "view index");
+ view_index_out->data.location = VARYING_SLOT_VIEW_INDEX;
+ }
- assert(view_index->parent_instr->block == nir_start_block(entrypoint));
- b->cursor = nir_after_instr(view_index->parent_instr);
+ nir_variable *layer_id_out =
+ nir_variable_create(shader, nir_var_shader_out,
+ glsl_int_type(), "layer ID");
+ layer_id_out->data.location = VARYING_SLOT_LAYER;
- /* Unless there is only one possible view index (that would be set
- * directly), pass it to the next stage. */
- if (util_bitcount(state.view_mask) != 1) {
- nir_variable *view_index_out =
- nir_variable_create(shader, nir_var_shader_out,
- glsl_int_type(), "view index");
- view_index_out->data.location = VARYING_SLOT_VIEW_INDEX;
+ if (shader->info.stage != MESA_SHADER_GEOMETRY) {
+ if (view_index_out)
nir_store_var(b, view_index_out, view_index, 0x1);
- }
- nir_variable *layer_id_out =
- nir_variable_create(shader, nir_var_shader_out,
- glsl_int_type(), "layer ID");
- layer_id_out->data.location = VARYING_SLOT_LAYER;
nir_store_var(b, layer_id_out, view_index, 0x1);
-
- progress = true;
}
- if (progress) {
- nir_metadata_preserve(entrypoint, nir_metadata_block_index |
- nir_metadata_dominance);
+ nir_foreach_block(block, entrypoint) {
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+
+ switch (load->intrinsic) {
+ case nir_intrinsic_load_instance_id:
+ if (&load->def != state.instance_id_with_views) {
+ nir_def_rewrite_uses(&load->def, build_instance_id(&state));
+ nir_instr_remove(&load->instr);
+ }
+ break;
+ case nir_intrinsic_load_view_index:
+ nir_def_rewrite_uses(&load->def, view_index);
+ nir_instr_remove(&load->instr);
+ break;
+ case nir_intrinsic_emit_vertex_with_counter:
+ /* In geometry shaders, outputs become undefined after every
+ * EmitVertex() call. We need to re-emit them for each vertex.
+ */
+ b->cursor = nir_before_instr(instr);
+ if (view_index_out)
+ nir_store_var(b, view_index_out, view_index, 0x1);
+
+ nir_store_var(b, layer_id_out, view_index, 0x1);
+ break;
+ default:
+ break;
+ }
+ }
}
- return progress;
+ nir_metadata_preserve(entrypoint, nir_metadata_block_index |
+ nir_metadata_dominance);
+
+ return true;
}
bool
-anv_check_for_primitive_replication(nir_shader **shaders,
- struct anv_graphics_pipeline *pipeline)
+anv_check_for_primitive_replication(struct anv_device *device,
+ VkShaderStageFlags stages,
+ nir_shader **shaders,
+ uint32_t view_mask)
{
- assert(pipeline->base.device->info.ver >= 12);
+ assert(device->info->ver >= 12);
static int primitive_replication_max_views = -1;
if (primitive_replication_max_views < 0) {
@@ -300,7 +323,7 @@ anv_check_for_primitive_replication(nir_shader **shaders,
primitive_replication_max_views =
MIN2(MAX_VIEWS_FOR_PRIMITIVE_REPLICATION,
- env_var_as_unsigned("ANV_PRIMITIVE_REPLICATION_MAX_VIEWS",
+ debug_get_num_option("ANV_PRIMITIVE_REPLICATION_MAX_VIEWS",
default_max_views));
}
@@ -308,18 +331,15 @@ anv_check_for_primitive_replication(nir_shader **shaders,
* later than Vertex. In that case only the last stage can refer to
* gl_ViewIndex.
*/
- if (pipeline->active_stages != (VK_SHADER_STAGE_VERTEX_BIT |
- VK_SHADER_STAGE_FRAGMENT_BIT)) {
+ if (stages & ~(VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT))
return false;
- }
- uint32_t view_mask = pipeline->subpass->view_mask;
- int view_count = util_bitcount(view_mask);
- if (view_count == 1 || view_count > primitive_replication_max_views)
+ /* It's possible we have no vertex shader yet (with pipeline libraries) */
+ if (!(stages & VK_SHADER_STAGE_VERTEX_BIT))
return false;
- /* We can't access the view index in the fragment shader. */
- if (nir_shader_uses_view_index(shaders[MESA_SHADER_FRAGMENT]))
+ int view_count = util_bitcount(view_mask);
+ if (view_count == 1 || view_count > primitive_replication_max_views)
return false;
return nir_can_lower_multiview(shaders[MESA_SHADER_VERTEX]);
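/* Editorial sketch, not part of the patch: the instancing math used by
 * this lowering.  The hardware instance ID is the application instance
 * ID times the view count; a packed nibble table remaps the compacted
 * view slot back to the real view index.  view_mask below is made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   const uint32_t view_mask = 0xa;   /* views 1 and 3 enabled */
   unsigned view_count = 0;
   uint64_t remap = 0;
   unsigned slot = 0;

   for (unsigned bit = 0; bit < 32; bit++) {
      if (view_mask & (1u << bit)) {
         remap |= (uint64_t)bit << (slot++ * 4);
         view_count++;
      }
   }

   for (uint32_t hw_instance = 0; hw_instance < 4; hw_instance++) {
      uint32_t instance_id = hw_instance / view_count;
      uint32_t compacted   = hw_instance % view_count;
      uint32_t view_index  = (remap >> (compacted * 4)) & 0xf;
      printf("hw=%u -> instance=%u view=%u\n", hw_instance, instance_id, view_index);
   }
   return 0;
}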
diff --git a/src/intel/vulkan/anv_nir_lower_resource_intel.c b/src/intel/vulkan/anv_nir_lower_resource_intel.c
new file mode 100644
index 00000000000..92b18bf51b9
--- /dev/null
+++ b/src/intel/vulkan/anv_nir_lower_resource_intel.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+#include "nir_builder.h"
+
+/* This pass updates the block index in the resource_intel intrinsics if the
+ * array index is constant.
+ *
+ * This pass must be run before anv_nir_compute_push_layout().
+ */
+static bool
+update_resource_intel_block(nir_builder *b, nir_intrinsic_instr *intrin,
+ UNUSED void *data)
+{
+ if (intrin->intrinsic != nir_intrinsic_resource_intel)
+ return false;
+
+ /* If the array index into the descriptor binding is not constant, we won't
+ * be able to turn this load_ubo into a push constant. In that case, or if
+ * the resource is not flagged pushable, set the block to 0xffffffff.
+ *
+ * Otherwise, add the array index to the block index so that when
+ * anv_nir_compute_push_layout() uses the block value, it picks the right
+ * surface within the binding's array.
+ */
+ if (!nir_src_is_const(intrin->src[2]) ||
+ !(nir_intrinsic_resource_access_intel(intrin) &
+ nir_resource_intel_pushable)) {
+ nir_intrinsic_set_resource_block_intel(intrin, 0xffffffff);
+ nir_intrinsic_set_resource_access_intel(
+ intrin,
+ nir_intrinsic_resource_access_intel(intrin) &
+ ~nir_resource_intel_pushable);
+ } else {
+ nir_intrinsic_set_resource_block_intel(
+ intrin,
+ nir_intrinsic_resource_block_intel(intrin) +
+ nir_src_as_uint(intrin->src[2]));
+ }
+
+ return true;
+}
+
+bool
+anv_nir_update_resource_intel_block(nir_shader *shader)
+{
+ return nir_shader_intrinsics_pass(shader, update_resource_intel_block,
+ nir_metadata_all,
+ NULL);
+}
+
+struct lower_resource_state {
+ enum anv_descriptor_set_layout_type desc_type;
+ const struct anv_physical_device *device;
+};
+
+/* This pass lowers the resource_intel surface_index source, combining the
+ * descriptor set offset with the surface offset in the descriptor set.
+ *
+ * This pass must be run after anv_nir_compute_push_layout() because we want
+ * the push constant selection to see whether the surface offset is constant.
+ * Once the offsets are combined, that constant detection no longer works.
+ */
+static bool
+lower_resource_intel(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
+{
+ if (intrin->intrinsic != nir_intrinsic_resource_intel)
+ return false;
+
+ const bool is_bindless =
+ (nir_intrinsic_resource_access_intel(intrin) &
+ nir_resource_intel_bindless) != 0;
+ const bool is_sampler =
+ (nir_intrinsic_resource_access_intel(intrin) &
+ nir_resource_intel_sampler) != 0;
+ const bool is_embedded_sampler =
+ (nir_intrinsic_resource_access_intel(intrin) &
+ nir_resource_intel_sampler_embedded) != 0;
+ const struct lower_resource_state *state = data;
+
+ /* Ignore binding table accesses & embedded samplers */
+ if (is_embedded_sampler) {
+ assert(state->desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER);
+ return false;
+ }
+
+ if (!is_bindless)
+ return true;
+
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ nir_def *set_offset = intrin->src[0].ssa;
+ nir_def *binding_offset = intrin->src[1].ssa;
+
+ /* With indirect descriptors, the surface handles are loaded from the
+ * descriptor buffer and do not need any offset.
+ */
+ if (state->desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ||
+ state->desc_type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER) {
+ if (!state->device->uses_ex_bso) {
+ /* We're trying to reduce the number of instructions in the shaders
+ * to compute surface handles. The assumption is that we're using
+ * more surface handles than sampler handles (UBO, SSBO, images,
+ * etc...) so it's worth optimizing that case.
+ *
+ * Surface handles in the extended descriptor message have to be
+ * shifted left by 6 prior to ex_bso (bits 31:12 in extended
+ * descriptor, match bits 25:6 of the surface handle). We have to
+ * combine 2 parts in the shader to build the final surface handle,
+ * base offset of the descriptor set (in the push constant, located
+ * in resource_intel::src[0]) and the relative descriptor offset
+ * (resource_intel::src[1]).
+ *
+ * For convenience, up to here, resource_intel::src[1] is in bytes.
+ * We now have to shift it left by 6 to match the shifted left by 6
+ * done for the push constant value provided in
+ * resource_intel::src[0]. That way the shader can just do a single
+ * ADD and get the surface handle.
+ */
+ if (!is_sampler)
+ binding_offset = nir_ishl_imm(b, binding_offset, 6);
+ }
+
+ nir_src_rewrite(&intrin->src[1],
+ nir_iadd(b, set_offset, binding_offset));
+ }
+
+ /* Now-unused values: set offset, array index */
+ nir_src_rewrite(&intrin->src[0], nir_imm_int(b, 0xdeaddeed));
+ nir_src_rewrite(&intrin->src[2], nir_imm_int(b, 0xdeaddeed));
+
+ return true;
+}
+
+bool
+anv_nir_lower_resource_intel(nir_shader *shader,
+ const struct anv_physical_device *device,
+ enum anv_descriptor_set_layout_type desc_type)
+{
+ struct lower_resource_state state = {
+ .desc_type = desc_type,
+ .device = device,
+ };
+ return nir_shader_intrinsics_pass(shader, lower_resource_intel,
+ nir_metadata_block_index |
+ nir_metadata_dominance,
+ &state);
+}
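/* Editorial sketch, not part of the patch: the single-ADD surface handle
 * combination described above for the non-ex_bso buffer layout.  The
 * set base offset is assumed to arrive pre-shifted by 6 in the push
 * constant; the binding offset is in bytes and gets the same shift.
 * All values are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t set_base_shifted     = 0x1000u << 6;  /* resource_intel::src[0] */
   uint32_t binding_offset_bytes = 0x40;          /* resource_intel::src[1] */

   uint32_t handle = set_base_shifted + (binding_offset_bytes << 6);
   printf("combined surface handle bits: 0x%08x\n", handle);
   return 0;
}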
diff --git a/src/intel/vulkan/anv_nir_lower_ubo_loads.c b/src/intel/vulkan/anv_nir_lower_ubo_loads.c
index 35b963835e6..c85c656d296 100644
--- a/src/intel/vulkan/anv_nir_lower_ubo_loads.c
+++ b/src/intel/vulkan/anv_nir_lower_ubo_loads.c
@@ -25,29 +25,26 @@
#include "nir_builder.h"
static bool
-lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
+lower_ubo_load_instr(nir_builder *b, nir_intrinsic_instr *load,
+ UNUSED void *_data)
{
- if (instr->type != nir_instr_type_intrinsic)
- return false;
-
- nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
if (load->intrinsic != nir_intrinsic_load_global_constant_offset &&
load->intrinsic != nir_intrinsic_load_global_constant_bounded)
return false;
- b->cursor = nir_before_instr(instr);
+ b->cursor = nir_before_instr(&load->instr);
- nir_ssa_def *base_addr = load->src[0].ssa;
- nir_ssa_def *bound = NULL;
+ nir_def *base_addr = load->src[0].ssa;
+ nir_def *bound = NULL;
if (load->intrinsic == nir_intrinsic_load_global_constant_bounded)
bound = load->src[2].ssa;
- unsigned bit_size = load->dest.ssa.bit_size;
+ unsigned bit_size = load->def.bit_size;
assert(bit_size >= 8 && bit_size % 8 == 0);
unsigned byte_size = bit_size / 8;
- nir_ssa_def *val;
- if (nir_src_is_const(load->src[1])) {
+ nir_def *val;
+ if (!nir_src_is_divergent(load->src[0]) && nir_src_is_const(load->src[1])) {
uint32_t offset = nir_src_as_uint(load->src[1]);
/* Things should be component-aligned. */
@@ -59,17 +56,16 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
uint64_t aligned_offset = offset - suboffset;
/* Load two just in case we go over a 64B boundary */
- nir_ssa_def *data[2];
+ nir_def *data[2];
for (unsigned i = 0; i < 2; i++) {
- nir_ssa_def *pred;
+ nir_def *pred;
if (bound) {
- pred = nir_ilt(b, nir_imm_int(b, aligned_offset + i * 64 + 63),
- bound);
+ pred = nir_igt_imm(b, bound, aligned_offset + i * 64 + 63);
} else {
pred = nir_imm_true(b);
}
- nir_ssa_def *addr = nir_iadd_imm(b, base_addr,
+ nir_def *addr = nir_iadd_imm(b, base_addr,
aligned_offset + i * 64);
data[i] = nir_load_global_const_block_intel(b, 16, addr, pred);
@@ -78,21 +74,21 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
val = nir_extract_bits(b, data, 2, suboffset * 8,
load->num_components, bit_size);
} else {
- nir_ssa_def *offset = load->src[1].ssa;
- nir_ssa_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset));
+ nir_def *offset = load->src[1].ssa;
+ nir_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset));
if (bound) {
- nir_ssa_def *zero = nir_imm_zero(b, load->num_components, bit_size);
+ nir_def *zero = nir_imm_zero(b, load->num_components, bit_size);
unsigned load_size = byte_size * load->num_components;
- nir_ssa_def *in_bounds =
+ nir_def *in_bounds =
nir_ilt(b, nir_iadd_imm(b, offset, load_size - 1), bound);
nir_push_if(b, in_bounds);
- nir_ssa_def *load_val =
- nir_build_load_global_constant(b, load->dest.ssa.num_components,
- load->dest.ssa.bit_size, addr,
+ nir_def *load_val =
+ nir_build_load_global_constant(b, load->def.num_components,
+ load->def.bit_size, addr,
.access = nir_intrinsic_access(load),
.align_mul = nir_intrinsic_align_mul(load),
.align_offset = nir_intrinsic_align_offset(load));
@@ -101,15 +97,15 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
val = nir_if_phi(b, load_val, zero);
} else {
- val = nir_build_load_global_constant(b, load->dest.ssa.num_components,
- load->dest.ssa.bit_size, addr,
+ val = nir_build_load_global_constant(b, load->def.num_components,
+ load->def.bit_size, addr,
.access = nir_intrinsic_access(load),
.align_mul = nir_intrinsic_align_mul(load),
.align_offset = nir_intrinsic_align_offset(load));
}
}
- nir_ssa_def_rewrite_uses(&load->dest.ssa, val);
+ nir_def_rewrite_uses(&load->def, val);
nir_instr_remove(&load->instr);
return true;
@@ -118,8 +114,7 @@ lower_ubo_load_instr(nir_builder *b, nir_instr *instr, UNUSED void *_data)
bool
anv_nir_lower_ubo_loads(nir_shader *shader)
{
- return nir_shader_instructions_pass(shader, lower_ubo_load_instr,
- nir_metadata_block_index |
- nir_metadata_dominance,
+ return nir_shader_intrinsics_pass(shader, lower_ubo_load_instr,
+ nir_metadata_none,
NULL);
}
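/* Editorial sketch, not part of the patch: the constant-offset path
 * above.  The byte offset is split into a 64B-aligned base and a
 * sub-offset (assumed offset % 64 here), and two 64B block loads are
 * issued so a value straddling a 64B boundary can still be extracted.
 * The offsets are made up.
 */
#include <stdio.h>

int main(void)
{
   unsigned offset = 180;       /* byte offset of the UBO load */
   unsigned load_bytes = 16;    /* e.g. a vec4 */

   unsigned suboffset = offset % 64;              /* 52  */
   unsigned aligned_offset = offset - suboffset;  /* 128 */

   printf("block loads at %u and %u; extract %u bytes at sub-offset %u\n",
          aligned_offset, aligned_offset + 64, load_bytes, suboffset);
   return 0;
}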
diff --git a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
deleted file mode 100644
index a1504120247..00000000000
--- a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- * Copyright © 2017 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_nir.h"
-#include "anv_private.h"
-#include "nir/nir.h"
-#include "nir/nir_builder.h"
-#include "nir/nir_vulkan.h"
-
-struct ycbcr_state {
- nir_builder *builder;
- nir_ssa_def *image_size;
- nir_tex_instr *origin_tex;
- nir_deref_instr *tex_deref;
- struct anv_ycbcr_conversion *conversion;
-};
-
-/* TODO: we should probably replace this with a push constant/uniform. */
-static nir_ssa_def *
-get_texture_size(struct ycbcr_state *state, nir_deref_instr *texture)
-{
- if (state->image_size)
- return state->image_size;
-
- nir_builder *b = state->builder;
- const struct glsl_type *type = texture->type;
- nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
-
- tex->op = nir_texop_txs;
- tex->sampler_dim = glsl_get_sampler_dim(type);
- tex->is_array = glsl_sampler_type_is_array(type);
- tex->is_shadow = glsl_sampler_type_is_shadow(type);
- tex->dest_type = nir_type_int32;
-
- tex->src[0].src_type = nir_tex_src_texture_deref;
- tex->src[0].src = nir_src_for_ssa(&texture->dest.ssa);
-
- nir_ssa_dest_init(&tex->instr, &tex->dest,
- nir_tex_instr_dest_size(tex), 32, NULL);
- nir_builder_instr_insert(b, &tex->instr);
-
- state->image_size = nir_i2f32(b, &tex->dest.ssa);
-
- return state->image_size;
-}
-
-static nir_ssa_def *
-implicit_downsampled_coord(nir_builder *b,
- nir_ssa_def *value,
- nir_ssa_def *max_value,
- int div_scale)
-{
- return nir_fadd(b,
- value,
- nir_fdiv(b,
- nir_imm_float(b, 1.0f),
- nir_fmul(b,
- nir_imm_float(b, div_scale),
- max_value)));
-}
-
-static nir_ssa_def *
-implicit_downsampled_coords(struct ycbcr_state *state,
- nir_ssa_def *old_coords,
- const struct anv_format_plane *plane_format)
-{
- nir_builder *b = state->builder;
- struct anv_ycbcr_conversion *conversion = state->conversion;
- nir_ssa_def *image_size = get_texture_size(state, state->tex_deref);
- nir_ssa_def *comp[4] = { NULL, };
- int c;
-
- for (c = 0; c < ARRAY_SIZE(conversion->chroma_offsets); c++) {
- if (plane_format->denominator_scales[c] > 1 &&
- conversion->chroma_offsets[c] == VK_CHROMA_LOCATION_COSITED_EVEN) {
- comp[c] = implicit_downsampled_coord(b,
- nir_channel(b, old_coords, c),
- nir_channel(b, image_size, c),
- plane_format->denominator_scales[c]);
- } else {
- comp[c] = nir_channel(b, old_coords, c);
- }
- }
-
- /* Leave other coordinates untouched */
- for (; c < old_coords->num_components; c++)
- comp[c] = nir_channel(b, old_coords, c);
-
- return nir_vec(b, comp, old_coords->num_components);
-}
-
-static nir_ssa_def *
-create_plane_tex_instr_implicit(struct ycbcr_state *state,
- uint32_t plane)
-{
- nir_builder *b = state->builder;
- struct anv_ycbcr_conversion *conversion = state->conversion;
- const struct anv_format_plane *plane_format =
- &conversion->format->planes[plane];
- nir_tex_instr *old_tex = state->origin_tex;
- nir_tex_instr *tex = nir_tex_instr_create(b->shader, old_tex->num_srcs + 1);
-
- for (uint32_t i = 0; i < old_tex->num_srcs; i++) {
- tex->src[i].src_type = old_tex->src[i].src_type;
-
- switch (old_tex->src[i].src_type) {
- case nir_tex_src_coord:
- if (plane_format->has_chroma && conversion->chroma_reconstruction) {
- assert(old_tex->src[i].src.is_ssa);
- tex->src[i].src =
- nir_src_for_ssa(implicit_downsampled_coords(state,
- old_tex->src[i].src.ssa,
- plane_format));
- break;
- }
- FALLTHROUGH;
- default:
- nir_src_copy(&tex->src[i].src, &old_tex->src[i].src);
- break;
- }
- }
- tex->src[tex->num_srcs - 1].src = nir_src_for_ssa(nir_imm_int(b, plane));
- tex->src[tex->num_srcs - 1].src_type = nir_tex_src_plane;
-
- tex->sampler_dim = old_tex->sampler_dim;
- tex->dest_type = old_tex->dest_type;
-
- tex->op = old_tex->op;
- tex->coord_components = old_tex->coord_components;
- tex->is_new_style_shadow = old_tex->is_new_style_shadow;
- tex->component = old_tex->component;
-
- tex->texture_index = old_tex->texture_index;
- tex->sampler_index = old_tex->sampler_index;
- tex->is_array = old_tex->is_array;
-
- nir_ssa_dest_init(&tex->instr, &tex->dest,
- old_tex->dest.ssa.num_components,
- nir_dest_bit_size(old_tex->dest), NULL);
- nir_builder_instr_insert(b, &tex->instr);
-
- return &tex->dest.ssa;
-}
-
-static unsigned
-channel_to_component(enum isl_channel_select channel)
-{
- switch (channel) {
- case ISL_CHANNEL_SELECT_RED:
- return 0;
- case ISL_CHANNEL_SELECT_GREEN:
- return 1;
- case ISL_CHANNEL_SELECT_BLUE:
- return 2;
- case ISL_CHANNEL_SELECT_ALPHA:
- return 3;
- default:
- unreachable("invalid channel");
- return 0;
- }
-}
-
-static enum isl_channel_select
-swizzle_channel(struct isl_swizzle swizzle, unsigned channel)
-{
- switch (channel) {
- case 0:
- return swizzle.r;
- case 1:
- return swizzle.g;
- case 2:
- return swizzle.b;
- case 3:
- return swizzle.a;
- default:
- unreachable("invalid channel");
- return 0;
- }
-}
-
-static bool
-try_lower_tex_ycbcr(const struct anv_pipeline_layout *layout,
- nir_builder *builder,
- nir_tex_instr *tex)
-{
- int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
- assert(deref_src_idx >= 0);
- nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
-
- nir_variable *var = nir_deref_instr_get_variable(deref);
- const struct anv_descriptor_set_layout *set_layout =
- layout->set[var->data.descriptor_set].layout;
- const struct anv_descriptor_set_binding_layout *binding =
- &set_layout->binding[var->data.binding];
-
- /* For the following instructions, we don't apply any change and let the
- * instruction apply to the first plane.
- */
- if (tex->op == nir_texop_txs ||
- tex->op == nir_texop_query_levels ||
- tex->op == nir_texop_lod)
- return false;
-
- if (binding->immutable_samplers == NULL)
- return false;
-
- assert(tex->texture_index == 0);
- unsigned array_index = 0;
- if (deref->deref_type != nir_deref_type_var) {
- assert(deref->deref_type == nir_deref_type_array);
- if (!nir_src_is_const(deref->arr.index))
- return false;
- array_index = nir_src_as_uint(deref->arr.index);
- array_index = MIN2(array_index, binding->array_size - 1);
- }
- const struct anv_sampler *sampler = binding->immutable_samplers[array_index];
-
- if (sampler->conversion == NULL)
- return false;
-
- struct ycbcr_state state = {
- .builder = builder,
- .origin_tex = tex,
- .tex_deref = deref,
- .conversion = sampler->conversion,
- };
-
- builder->cursor = nir_before_instr(&tex->instr);
-
- const struct anv_format *format = state.conversion->format;
- const struct isl_format_layout *y_isl_layout = NULL;
- for (uint32_t p = 0; p < format->n_planes; p++) {
- if (!format->planes[p].has_chroma)
- y_isl_layout = isl_format_get_layout(format->planes[p].isl_format);
- }
- assert(y_isl_layout != NULL);
- uint8_t y_bpc = y_isl_layout->channels_array[0].bits;
-
- /* |ycbcr_comp| holds components in the order : Cr-Y-Cb */
- nir_ssa_def *zero = nir_imm_float(builder, 0.0f);
- nir_ssa_def *one = nir_imm_float(builder, 1.0f);
- /* Use extra 2 channels for following swizzle */
- nir_ssa_def *ycbcr_comp[5] = { zero, zero, zero, one, zero };
-
- uint8_t ycbcr_bpcs[5];
- memset(ycbcr_bpcs, y_bpc, sizeof(ycbcr_bpcs));
-
- /* Go through all the planes and gather the samples into a |ycbcr_comp|
- * while applying a swizzle required by the spec:
- *
- * R, G, B should respectively map to Cr, Y, Cb
- */
- for (uint32_t p = 0; p < format->n_planes; p++) {
- const struct anv_format_plane *plane_format = &format->planes[p];
- nir_ssa_def *plane_sample = create_plane_tex_instr_implicit(&state, p);
-
- for (uint32_t pc = 0; pc < 4; pc++) {
- enum isl_channel_select ycbcr_swizzle =
- swizzle_channel(plane_format->ycbcr_swizzle, pc);
- if (ycbcr_swizzle == ISL_CHANNEL_SELECT_ZERO)
- continue;
-
- unsigned ycbcr_component = channel_to_component(ycbcr_swizzle);
- ycbcr_comp[ycbcr_component] = nir_channel(builder, plane_sample, pc);
-
- /* Also compute the number of bits for each component. */
- const struct isl_format_layout *isl_layout =
- isl_format_get_layout(plane_format->isl_format);
- ycbcr_bpcs[ycbcr_component] = isl_layout->channels_array[pc].bits;
- }
- }
-
- /* Now remaps components to the order specified by the conversion. */
- nir_ssa_def *swizzled_comp[4] = { NULL, };
- uint32_t swizzled_bpcs[4] = { 0, };
-
- for (uint32_t i = 0; i < ARRAY_SIZE(state.conversion->mapping); i++) {
- /* Maps to components in |ycbcr_comp| */
- static const uint32_t swizzle_mapping[] = {
- [VK_COMPONENT_SWIZZLE_ZERO] = 4,
- [VK_COMPONENT_SWIZZLE_ONE] = 3,
- [VK_COMPONENT_SWIZZLE_R] = 0,
- [VK_COMPONENT_SWIZZLE_G] = 1,
- [VK_COMPONENT_SWIZZLE_B] = 2,
- [VK_COMPONENT_SWIZZLE_A] = 3,
- };
- const VkComponentSwizzle m = state.conversion->mapping[i];
-
- if (m == VK_COMPONENT_SWIZZLE_IDENTITY) {
- swizzled_comp[i] = ycbcr_comp[i];
- swizzled_bpcs[i] = ycbcr_bpcs[i];
- } else {
- swizzled_comp[i] = ycbcr_comp[swizzle_mapping[m]];
- swizzled_bpcs[i] = ycbcr_bpcs[swizzle_mapping[m]];
- }
- }
-
- nir_ssa_def *result = nir_vec(builder, swizzled_comp, 4);
- if (state.conversion->ycbcr_model != VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) {
- result = nir_convert_ycbcr_to_rgb(builder,
- state.conversion->ycbcr_model,
- state.conversion->ycbcr_range,
- result,
- swizzled_bpcs);
- }
-
- nir_ssa_def_rewrite_uses(&tex->dest.ssa, result);
- nir_instr_remove(&tex->instr);
-
- return true;
-}
-
-bool
-anv_nir_lower_ycbcr_textures(nir_shader *shader,
- const struct anv_pipeline_layout *layout)
-{
- bool progress = false;
-
- nir_foreach_function(function, shader) {
- if (!function->impl)
- continue;
-
- bool function_progress = false;
- nir_builder builder;
- nir_builder_init(&builder, function->impl);
-
- nir_foreach_block(block, function->impl) {
- nir_foreach_instr_safe(instr, block) {
- if (instr->type != nir_instr_type_tex)
- continue;
-
- nir_tex_instr *tex = nir_instr_as_tex(instr);
- function_progress |= try_lower_tex_ycbcr(layout, &builder, tex);
- }
- }
-
- if (function_progress) {
- nir_metadata_preserve(function->impl,
- nir_metadata_block_index |
- nir_metadata_dominance);
- }
-
- progress |= function_progress;
- }
-
- return progress;
-}
diff --git a/src/intel/vulkan/anv_nir_push_descriptor_analysis.c b/src/intel/vulkan/anv_nir_push_descriptor_analysis.c
new file mode 100644
index 00000000000..c6dcb03769d
--- /dev/null
+++ b/src/intel/vulkan/anv_nir_push_descriptor_analysis.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_nir.h"
+
+#include "compiler/brw_nir.h"
+
+const struct anv_descriptor_set_layout *
+anv_pipeline_layout_get_push_set(const struct anv_pipeline_sets_layout *layout,
+ uint8_t *set_idx)
+{
+ for (unsigned s = 0; s < ARRAY_SIZE(layout->set); s++) {
+ struct anv_descriptor_set_layout *set_layout = layout->set[s].layout;
+
+ if (!set_layout ||
+ !(set_layout->flags &
+ VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR))
+ continue;
+
+ if (set_idx)
+ *set_idx = s;
+
+ return set_layout;
+ }
+
+ return NULL;
+}
+
+/* This function returns a bitfield of the descriptors used in the push
+ * descriptor set. It must be called before anv_nir_apply_pipeline_layout(),
+ * as the information it needs is lost once the pipeline layout is applied.
+ */
+uint32_t
+anv_nir_compute_used_push_descriptors(nir_shader *shader,
+ const struct anv_pipeline_sets_layout *layout)
+{
+ uint8_t push_set;
+ const struct anv_descriptor_set_layout *push_set_layout =
+ anv_pipeline_layout_get_push_set(layout, &push_set);
+ if (push_set_layout == NULL)
+ return 0;
+
+ uint32_t used_push_bindings = 0;
+ nir_foreach_variable_with_modes(var, shader,
+ nir_var_uniform |
+ nir_var_image |
+ nir_var_mem_ubo |
+ nir_var_mem_ssbo) {
+ if (var->data.descriptor_set == push_set) {
+ uint32_t desc_idx =
+ push_set_layout->binding[var->data.binding].descriptor_index;
+ assert(desc_idx < MAX_PUSH_DESCRIPTORS);
+ used_push_bindings |= BITFIELD_BIT(desc_idx);
+ }
+ }
+
+ nir_foreach_function_impl(impl, shader) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_vulkan_resource_index)
+ continue;
+
+ uint8_t set = nir_intrinsic_desc_set(intrin);
+ if (set != push_set)
+ continue;
+
+ uint32_t binding = nir_intrinsic_binding(intrin);
+ uint32_t desc_idx =
+ push_set_layout->binding[binding].descriptor_index;
+ assert(desc_idx < MAX_PUSH_DESCRIPTORS);
+
+ used_push_bindings |= BITFIELD_BIT(desc_idx);
+ }
+ }
+ }
+
+ return used_push_bindings;
+}
+
+/* This function checks whether the shader accesses the push descriptor
+ * buffer. This function must be called after anv_nir_compute_push_layout().
+ */
+bool
+anv_nir_loads_push_desc_buffer(nir_shader *nir,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_pipeline_bind_map *bind_map)
+{
+ uint8_t push_set;
+ const struct anv_descriptor_set_layout *push_set_layout =
+ anv_pipeline_layout_get_push_set(layout, &push_set);
+ if (push_set_layout == NULL)
+ return false;
+
+ nir_foreach_function_impl(impl, nir) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_load_ubo)
+ continue;
+
+ const unsigned bt_idx =
+ brw_nir_ubo_surface_index_get_bti(intrin->src[0]);
+ if (bt_idx == UINT32_MAX)
+ continue;
+
+ const struct anv_pipeline_binding *binding =
+ &bind_map->surface_to_descriptor[bt_idx];
+ if ((binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS ||
+ binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER) &&
+ binding->index == push_set) {
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/* This function computes a bitfield of all the UBO bindings in the push
+ * descriptor set that are fully promoted to push constants. If a binding's
+ * bit in the field is set, the corresponding binding table entry will not be
+ * accessed by the shader. This function must be called after
+ * anv_nir_compute_push_layout().
+ */
+uint32_t
+anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir,
+ const struct anv_pipeline_sets_layout *layout,
+ const struct anv_pipeline_bind_map *bind_map)
+{
+ uint8_t push_set;
+ const struct anv_descriptor_set_layout *push_set_layout =
+ anv_pipeline_layout_get_push_set(layout, &push_set);
+ if (push_set_layout == NULL)
+ return 0;
+
+ /* Assume every UBO can be promoted first. */
+ uint32_t ubos_fully_promoted = 0;
+ for (uint32_t b = 0; b < push_set_layout->binding_count; b++) {
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &push_set_layout->binding[b];
+ if (bind_layout->type == -1)
+ continue;
+
+ assert(bind_layout->descriptor_index < MAX_PUSH_DESCRIPTORS);
+ if (bind_layout->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
+ ubos_fully_promoted |= BITFIELD_BIT(bind_layout->descriptor_index);
+ }
+
+ /* For each load_ubo intrinsic, if the descriptor index or the offset is
+ * not a constant, we cannot promote it to a push constant. Otherwise,
+ * check the offset + size against the push ranges.
+ */
+ nir_foreach_function_impl(impl, nir) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_load_ubo)
+ continue;
+
+ /* Don't check the load_ubo from descriptor buffers */
+ nir_intrinsic_instr *resource =
+ intrin->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic ?
+ nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr) : NULL;
+ if (resource == NULL || resource->intrinsic != nir_intrinsic_resource_intel)
+ continue;
+
+ /* Skip load_ubo not loading from the push descriptor */
+ if (nir_intrinsic_desc_set(resource) != push_set)
+ continue;
+
+ uint32_t binding = nir_intrinsic_binding(resource);
+
+ /* If we have indirect indexing into the binding, no push promotion
+ * is possible for the entire binding.
+ */
+ if (!nir_src_is_const(resource->src[1])) {
+ for (uint32_t i = 0; i < push_set_layout->binding[binding].array_size; i++) {
+ ubos_fully_promoted &=
+ ~BITFIELD_BIT(push_set_layout->binding[binding].descriptor_index + i);
+ }
+ continue;
+ }
+
+ const nir_const_value *const_bt_id =
+ nir_src_as_const_value(resource->src[1]);
+ uint32_t bt_id = const_bt_id[0].u32;
+
+ const struct anv_pipeline_binding *pipe_bind =
+ &bind_map->surface_to_descriptor[bt_id];
+
+ const uint32_t desc_idx =
+ push_set_layout->binding[binding].descriptor_index;
+
+ /* If the offset in the entry is dynamic, we can't tell whether the
+ * load was promoted or not.
+ */
+ const nir_const_value *const_load_offset =
+ nir_src_as_const_value(intrin->src[1]);
+ if (const_load_offset == NULL) {
+ ubos_fully_promoted &= ~BITFIELD_BIT(desc_idx);
+ continue;
+ }
+
+ /* Check if the load was promoted to a push constant. */
+ const unsigned load_offset = const_load_offset[0].u32;
+ const int load_bytes = nir_intrinsic_dest_components(intrin) *
+ (intrin->def.bit_size / 8);
+
+ bool promoted = false;
+ for (unsigned i = 0; i < ARRAY_SIZE(bind_map->push_ranges); i++) {
+ if (bind_map->push_ranges[i].set == pipe_bind->set &&
+ bind_map->push_ranges[i].index == desc_idx &&
+ bind_map->push_ranges[i].start * 32 <= load_offset &&
+ (bind_map->push_ranges[i].start +
+ bind_map->push_ranges[i].length) * 32 >=
+ (load_offset + load_bytes)) {
+ promoted = true;
+ break;
+ }
+ }
+
+ if (!promoted)
+ ubos_fully_promoted &= ~BITFIELD_BIT(desc_idx);
+ }
+ }
+ }
+
+ return ubos_fully_promoted;
+}
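
Editor's note: the promotion test above compares byte offsets against push ranges expressed in 32-byte units, hence the `* 32` scaling on `start` and `start + length`. Below is a minimal standalone sketch of that comparison; the struct and function names are illustrative, not the driver's API.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-in for a push range expressed in 32-byte units. */
struct push_range_example {
   uint32_t start;    /* in 32-byte units */
   uint32_t length;   /* in 32-byte units */
};

/* Returns true when [load_offset, load_offset + load_bytes) lies entirely
 * inside the range, mirroring the start * 32 / (start + length) * 32 test
 * in anv_nir_push_desc_ubo_fully_promoted() above.
 */
static bool
example_load_is_promoted(const struct push_range_example *r,
                         uint32_t load_offset, uint32_t load_bytes)
{
   const uint32_t range_start = r->start * 32;
   const uint32_t range_end = (r->start + r->length) * 32;
   return range_start <= load_offset &&
          load_offset + load_bytes <= range_end;
}

/* Example: {start = 2, length = 1} covers bytes [64, 96), so a 16-byte load
 * at offset 64 is promoted while a 16-byte load at offset 88 is not.
 */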
diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c
deleted file mode 100644
index 634a3a3e24e..00000000000
--- a/src/intel/vulkan/anv_pass.c
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_private.h"
-
-#include "vk_format.h"
-#include "vk_util.h"
-
-static void
-anv_render_pass_add_subpass_dep(struct anv_device *device,
- struct anv_render_pass *pass,
- const VkSubpassDependency2KHR *dep)
-{
- if (dep->dstSubpass == VK_SUBPASS_EXTERNAL) {
- pass->subpass_flushes[pass->subpass_count] |=
- anv_pipe_invalidate_bits_for_access_flags(device, dep->dstAccessMask);
- } else {
- assert(dep->dstSubpass < pass->subpass_count);
- pass->subpass_flushes[dep->dstSubpass] |=
- anv_pipe_invalidate_bits_for_access_flags(device, dep->dstAccessMask);
- }
-
- if (dep->srcSubpass == VK_SUBPASS_EXTERNAL) {
- pass->subpass_flushes[0] |=
- anv_pipe_flush_bits_for_access_flags(device, dep->srcAccessMask);
- } else {
- assert(dep->srcSubpass < pass->subpass_count);
- pass->subpass_flushes[dep->srcSubpass + 1] |=
- anv_pipe_flush_bits_for_access_flags(device, dep->srcAccessMask);
- }
-}
-
-/* Do a second "compile" step on a render pass */
-static void
-anv_render_pass_compile(struct anv_render_pass *pass)
-{
- /* The CreateRenderPass code zeros the entire render pass and also uses a
- * designated initializer for filling these out. There's no need for us to
- * do it again.
- *
- * for (uint32_t i = 0; i < pass->attachment_count; i++) {
- * pass->attachments[i].usage = 0;
- * pass->attachments[i].first_subpass_layout = VK_IMAGE_LAYOUT_UNDEFINED;
- * }
- */
-
- VkImageUsageFlags all_usage = 0;
- for (uint32_t i = 0; i < pass->subpass_count; i++) {
- struct anv_subpass *subpass = &pass->subpasses[i];
-
- /* We don't allow depth_stencil_attachment to be non-NULL and be
- * VK_ATTACHMENT_UNUSED. This way something can just check for NULL
- * and be guaranteed that they have a valid attachment.
- */
- if (subpass->depth_stencil_attachment &&
- subpass->depth_stencil_attachment->attachment == VK_ATTACHMENT_UNUSED)
- subpass->depth_stencil_attachment = NULL;
-
- if (subpass->ds_resolve_attachment &&
- subpass->ds_resolve_attachment->attachment == VK_ATTACHMENT_UNUSED)
- subpass->ds_resolve_attachment = NULL;
-
- for (uint32_t j = 0; j < subpass->attachment_count; j++) {
- struct anv_subpass_attachment *subpass_att = &subpass->attachments[j];
- if (subpass_att->attachment == VK_ATTACHMENT_UNUSED)
- continue;
-
- struct anv_render_pass_attachment *pass_att =
- &pass->attachments[subpass_att->attachment];
-
- pass_att->usage |= subpass_att->usage;
- pass_att->last_subpass_idx = i;
-
- all_usage |= subpass_att->usage;
-
- if (pass_att->first_subpass_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
- pass_att->first_subpass_layout = subpass_att->layout;
- assert(pass_att->first_subpass_layout != VK_IMAGE_LAYOUT_UNDEFINED);
- }
-
- if (subpass_att->usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT &&
- subpass->depth_stencil_attachment &&
- subpass_att->attachment == subpass->depth_stencil_attachment->attachment)
- subpass->has_ds_self_dep = true;
- }
-
- /* We have to handle resolve attachments specially */
- subpass->has_color_resolve = false;
- if (subpass->resolve_attachments) {
- for (uint32_t j = 0; j < subpass->color_count; j++) {
- struct anv_subpass_attachment *color_att =
- &subpass->color_attachments[j];
- struct anv_subpass_attachment *resolve_att =
- &subpass->resolve_attachments[j];
- if (resolve_att->attachment == VK_ATTACHMENT_UNUSED)
- continue;
-
- subpass->has_color_resolve = true;
-
- assert(color_att->attachment < pass->attachment_count);
- struct anv_render_pass_attachment *color_pass_att =
- &pass->attachments[color_att->attachment];
-
- assert(resolve_att->usage == VK_IMAGE_USAGE_TRANSFER_DST_BIT);
- assert(color_att->usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT);
- color_pass_att->usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
- }
- }
-
- if (subpass->ds_resolve_attachment) {
- struct anv_subpass_attachment *ds_att =
- subpass->depth_stencil_attachment;
- UNUSED struct anv_subpass_attachment *resolve_att =
- subpass->ds_resolve_attachment;
-
- assert(ds_att->attachment < pass->attachment_count);
- struct anv_render_pass_attachment *ds_pass_att =
- &pass->attachments[ds_att->attachment];
-
- assert(resolve_att->usage == VK_IMAGE_USAGE_TRANSFER_DST_BIT);
- assert(ds_att->usage == VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT);
- ds_pass_att->usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
- }
-
- for (uint32_t j = 0; j < subpass->attachment_count; j++)
- assert(__builtin_popcount(subpass->attachments[j].usage) == 1);
- }
-
- /* From the Vulkan 1.0.39 spec:
- *
- * If there is no subpass dependency from VK_SUBPASS_EXTERNAL to the
- * first subpass that uses an attachment, then an implicit subpass
- * dependency exists from VK_SUBPASS_EXTERNAL to the first subpass it is
- * used in. The subpass dependency operates as if defined with the
- * following parameters:
- *
- * VkSubpassDependency implicitDependency = {
- * .srcSubpass = VK_SUBPASS_EXTERNAL;
- * .dstSubpass = firstSubpass; // First subpass attachment is used in
- * .srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
- * .dstStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
- * .srcAccessMask = 0;
- * .dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
- * .dependencyFlags = 0;
- * };
- *
- * Similarly, if there is no subpass dependency from the last subpass
- * that uses an attachment to VK_SUBPASS_EXTERNAL, then an implicit
- * subpass dependency exists from the last subpass it is used in to
- * VK_SUBPASS_EXTERNAL. The subpass dependency operates as if defined
- * with the following parameters:
- *
- * VkSubpassDependency implicitDependency = {
- * .srcSubpass = lastSubpass; // Last subpass attachment is used in
- * .dstSubpass = VK_SUBPASS_EXTERNAL;
- * .srcStageMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
- * .dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
- * .srcAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- * VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
- * .dstAccessMask = 0;
- * .dependencyFlags = 0;
- * };
- *
- * We could implement this by walking over all of the attachments and
- * subpasses and checking to see if any of them don't have an external
- * dependency. Or, we could just be lazy and add a couple extra flushes.
- * We choose to be lazy.
- *
- * From the documentation for vkCmdNextSubpass:
- *
- * "Moving to the next subpass automatically performs any multisample
- * resolve operations in the subpass being ended. End-of-subpass
- * multisample resolves are treated as color attachment writes for the
- * purposes of synchronization. This applies to resolve operations for
- * both color and depth/stencil attachments. That is, they are
- * considered to execute in the
- * VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT pipeline stage and
- * their writes are synchronized with
- * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT."
- *
- * Therefore, the above flags concerning color attachments also apply to
- * color and depth/stencil resolve attachments.
- */
- if (all_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
- pass->subpass_flushes[0] |=
- ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
- }
- if (all_usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
- VK_IMAGE_USAGE_TRANSFER_DST_BIT)) {
- pass->subpass_flushes[pass->subpass_count] |=
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
- }
- if (all_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
- pass->subpass_flushes[pass->subpass_count] |=
- ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
- }
-}
-
-static unsigned
-num_subpass_attachments2(const VkSubpassDescription2KHR *desc)
-{
- const VkSubpassDescriptionDepthStencilResolveKHR *ds_resolve =
- vk_find_struct_const(desc->pNext,
- SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
-
- return desc->inputAttachmentCount +
- desc->colorAttachmentCount +
- (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
- (desc->pDepthStencilAttachment != NULL) +
- (ds_resolve && ds_resolve->pDepthStencilResolveAttachment);
-}
-
-static bool
-vk_image_layout_depth_only(VkImageLayout layout)
-{
- switch (layout) {
- case VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_OPTIMAL:
- case VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_OPTIMAL:
- return true;
-
- default:
- return false;
- }
-}
-
-/* From the Vulkan Specification 1.2.166 - VkAttachmentReference2:
- *
- * "If layout only specifies the layout of the depth aspect of the
- * attachment, the layout of the stencil aspect is specified by the
- * stencilLayout member of a VkAttachmentReferenceStencilLayout structure
- * included in the pNext chain. Otherwise, layout describes the layout for
- * all relevant image aspects."
- */
-static VkImageLayout
-stencil_ref_layout(const VkAttachmentReference2KHR *att_ref)
-{
- if (!vk_image_layout_depth_only(att_ref->layout))
- return att_ref->layout;
-
- const VkAttachmentReferenceStencilLayoutKHR *stencil_ref =
- vk_find_struct_const(att_ref->pNext,
- ATTACHMENT_REFERENCE_STENCIL_LAYOUT_KHR);
- if (!stencil_ref)
- return VK_IMAGE_LAYOUT_UNDEFINED;
- return stencil_ref->stencilLayout;
-}
-
-/* From the Vulkan Specification 1.2.166 - VkAttachmentDescription2:
- *
- * "If format is a depth/stencil format, and initialLayout only specifies
- * the initial layout of the depth aspect of the attachment, the initial
- * layout of the stencil aspect is specified by the stencilInitialLayout
- * member of a VkAttachmentDescriptionStencilLayout structure included in
- * the pNext chain. Otherwise, initialLayout describes the initial layout
- * for all relevant image aspects."
- */
-static VkImageLayout
-stencil_desc_layout(const VkAttachmentDescription2KHR *att_desc, bool final)
-{
- if (!vk_format_has_stencil(att_desc->format))
- return VK_IMAGE_LAYOUT_UNDEFINED;
-
- const VkImageLayout main_layout =
- final ? att_desc->finalLayout : att_desc->initialLayout;
- if (!vk_image_layout_depth_only(main_layout))
- return main_layout;
-
- const VkAttachmentDescriptionStencilLayoutKHR *stencil_desc =
- vk_find_struct_const(att_desc->pNext,
- ATTACHMENT_DESCRIPTION_STENCIL_LAYOUT_KHR);
- assert(stencil_desc);
- return final ?
- stencil_desc->stencilFinalLayout :
- stencil_desc->stencilInitialLayout;
-}
-
-VkResult anv_CreateRenderPass2(
- VkDevice _device,
- const VkRenderPassCreateInfo2KHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkRenderPass* pRenderPass)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR);
-
- VK_MULTIALLOC(ma);
- VK_MULTIALLOC_DECL(&ma, struct anv_render_pass, pass, 1);
- VK_MULTIALLOC_DECL(&ma, struct anv_subpass, subpasses,
- pCreateInfo->subpassCount);
- VK_MULTIALLOC_DECL(&ma, struct anv_render_pass_attachment, attachments,
- pCreateInfo->attachmentCount);
- VK_MULTIALLOC_DECL(&ma, enum anv_pipe_bits, subpass_flushes,
- pCreateInfo->subpassCount + 1);
-
- uint32_t subpass_attachment_count = 0;
- for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
- subpass_attachment_count +=
- num_subpass_attachments2(&pCreateInfo->pSubpasses[i]);
- }
- VK_MULTIALLOC_DECL(&ma, struct anv_subpass_attachment, subpass_attachments,
- subpass_attachment_count);
-
- if (!vk_object_multizalloc(&device->vk, &ma, pAllocator,
- VK_OBJECT_TYPE_RENDER_PASS))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- /* Clear the subpasses along with the parent pass. This required because
- * each array member of anv_subpass must be a valid pointer if not NULL.
- */
- pass->attachment_count = pCreateInfo->attachmentCount;
- pass->subpass_count = pCreateInfo->subpassCount;
- pass->attachments = attachments;
- pass->subpass_flushes = subpass_flushes;
-
- for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
- pass->attachments[i] = (struct anv_render_pass_attachment) {
- .format = pCreateInfo->pAttachments[i].format,
- .samples = pCreateInfo->pAttachments[i].samples,
- .load_op = pCreateInfo->pAttachments[i].loadOp,
- .store_op = pCreateInfo->pAttachments[i].storeOp,
- .stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp,
- .initial_layout = pCreateInfo->pAttachments[i].initialLayout,
- .final_layout = pCreateInfo->pAttachments[i].finalLayout,
-
- .stencil_initial_layout = stencil_desc_layout(&pCreateInfo->pAttachments[i],
- false),
- .stencil_final_layout = stencil_desc_layout(&pCreateInfo->pAttachments[i],
- true),
- };
- }
-
- for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
- const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i];
- struct anv_subpass *subpass = &pass->subpasses[i];
-
- subpass->input_count = desc->inputAttachmentCount;
- subpass->color_count = desc->colorAttachmentCount;
- subpass->attachment_count = num_subpass_attachments2(desc);
- subpass->attachments = subpass_attachments;
- subpass->view_mask = desc->viewMask;
-
- if (desc->inputAttachmentCount > 0) {
- subpass->input_attachments = subpass_attachments;
- subpass_attachments += desc->inputAttachmentCount;
-
- for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
- subpass->input_attachments[j] = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
- .attachment = desc->pInputAttachments[j].attachment,
- .layout = desc->pInputAttachments[j].layout,
- .stencil_layout = stencil_ref_layout(&desc->pInputAttachments[j]),
- };
- }
- }
-
- if (desc->colorAttachmentCount > 0) {
- subpass->color_attachments = subpass_attachments;
- subpass_attachments += desc->colorAttachmentCount;
-
- for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
- subpass->color_attachments[j] = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
- .attachment = desc->pColorAttachments[j].attachment,
- .layout = desc->pColorAttachments[j].layout,
- };
- }
- }
-
- if (desc->pResolveAttachments) {
- subpass->resolve_attachments = subpass_attachments;
- subpass_attachments += desc->colorAttachmentCount;
-
- for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
- subpass->resolve_attachments[j] = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- .attachment = desc->pResolveAttachments[j].attachment,
- .layout = desc->pResolveAttachments[j].layout,
- };
- }
- }
-
- if (desc->pDepthStencilAttachment) {
- subpass->depth_stencil_attachment = subpass_attachments++;
-
- *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
- .attachment = desc->pDepthStencilAttachment->attachment,
- .layout = desc->pDepthStencilAttachment->layout,
- .stencil_layout = stencil_ref_layout(desc->pDepthStencilAttachment),
- };
- }
-
- const VkSubpassDescriptionDepthStencilResolveKHR *ds_resolve =
- vk_find_struct_const(desc->pNext,
- SUBPASS_DESCRIPTION_DEPTH_STENCIL_RESOLVE_KHR);
-
- if (ds_resolve && ds_resolve->pDepthStencilResolveAttachment) {
- subpass->ds_resolve_attachment = subpass_attachments++;
-
- *subpass->ds_resolve_attachment = (struct anv_subpass_attachment) {
- .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- .attachment = ds_resolve->pDepthStencilResolveAttachment->attachment,
- .layout = ds_resolve->pDepthStencilResolveAttachment->layout,
- .stencil_layout = stencil_ref_layout(ds_resolve->pDepthStencilResolveAttachment),
- };
- subpass->depth_resolve_mode = ds_resolve->depthResolveMode;
- subpass->stencil_resolve_mode = ds_resolve->stencilResolveMode;
- }
- }
-
- for (uint32_t i = 0; i < pCreateInfo->dependencyCount; i++) {
- anv_render_pass_add_subpass_dep(device, pass,
- &pCreateInfo->pDependencies[i]);
- }
-
- vk_foreach_struct(ext, pCreateInfo->pNext) {
- switch (ext->sType) {
- default:
- anv_debug_ignored_stype(ext->sType);
- }
- }
-
- anv_render_pass_compile(pass);
-
- *pRenderPass = anv_render_pass_to_handle(pass);
-
- return VK_SUCCESS;
-}
-
-void anv_DestroyRenderPass(
- VkDevice _device,
- VkRenderPass _pass,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_render_pass, pass, _pass);
-
- if (!pass)
- return;
-
- vk_object_free(&device->vk, pAllocator, pass);
-}
-
-void anv_GetRenderAreaGranularity(
- VkDevice device,
- VkRenderPass renderPass,
- VkExtent2D* pGranularity)
-{
- ANV_FROM_HANDLE(anv_render_pass, pass, renderPass);
-
- /* This granularity satisfies HiZ fast clear alignment requirements
- * for all sample counts.
- */
- for (unsigned i = 0; i < pass->subpass_count; ++i) {
- if (pass->subpasses[i].depth_stencil_attachment) {
- *pGranularity = (VkExtent2D) { .width = 8, .height = 4 };
- return;
- }
- }
-
- *pGranularity = (VkExtent2D) { 1, 1 };
-}
diff --git a/src/intel/vulkan/anv_perf.c b/src/intel/vulkan/anv_perf.c
index 560da6a7c31..3b23067ab23 100644
--- a/src/intel/vulkan/anv_perf.c
+++ b/src/intel/vulkan/anv_perf.c
@@ -36,39 +36,21 @@
void
anv_physical_device_init_perf(struct anv_physical_device *device, int fd)
{
- const struct intel_device_info *devinfo = &device->info;
-
device->perf = NULL;
- /* We need self modifying batches. The i915 parser prevents it on
- * Gfx7.5 :( maybe one day.
- */
- if (devinfo->ver < 8)
- return;
-
struct intel_perf_config *perf = intel_perf_new(NULL);
intel_perf_init_metrics(perf, &device->info, fd,
false /* pipeline statistics */,
true /* register snapshots */);
- if (!perf->n_queries) {
- if (perf->platform_supported) {
- static bool warned_once = false;
-
- if (!warned_once) {
- mesa_logw("Performance support disabled, "
- "consider sysctl dev.i915.perf_stream_paranoid=0\n");
- warned_once = true;
- }
- }
+ if (!perf->n_queries)
goto err;
- }
/* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in
* perf revision 2.
*/
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
if (!intel_perf_has_hold_preemption(perf))
goto err;
}
@@ -89,10 +71,13 @@ anv_physical_device_init_perf(struct anv_physical_device *device, int fd)
break;
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
device->n_perf_query_commands += field->size / 4;
break;
+ default:
+ unreachable("Unhandled register type");
}
}
device->n_perf_query_commands *= 2; /* Begin & End */
@@ -124,9 +109,10 @@ anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
properties[p++] = metric_id;
properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT;
- properties[p++] = device->info.ver >= 8 ?
- I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
- I915_OA_FORMAT_A45_B8_C8;
+ properties[p++] =
+ device->info->verx10 >= 125 ?
+ I915_OA_FORMAT_A24u40_A14u32_B8_C8 :
+ I915_OA_FORMAT_A32u40_A4u32_B8_C8;
properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT;
properties[p++] = 31; /* slowest sampling period */
@@ -141,8 +127,12 @@ anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
* Gfx11 for instance we use the full EU array. Initially when perf was
* enabled we would use only half on Gfx11 because of functional
* requirements.
+ *
+ * Temporarily disable this option on Gfx12.5+; the kernel doesn't appear
+ * to support it.
*/
- if (intel_perf_has_global_sseu(device->physical->perf)) {
+ if (intel_perf_has_global_sseu(device->physical->perf) &&
+ device->info->verx10 < 125) {
properties[p++] = DRM_I915_PERF_PROP_GLOBAL_SSEU;
properties[p++] = (uintptr_t) &device->physical->perf->sseu;
}
@@ -223,9 +213,9 @@ VkResult anv_AcquirePerformanceConfigurationINTEL(
config = vk_object_alloc(&device->vk, NULL, sizeof(*config),
VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL);
if (!config)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
config->register_config =
intel_perf_load_configuration(device->physical->perf, device->fd,
INTEL_PERF_QUERY_GUID_MDAPI);
@@ -258,7 +248,7 @@ VkResult anv_ReleasePerformanceConfigurationINTEL(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_performance_configuration_intel, config, _configuration);
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG))
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG))
intel_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config->config_id);
ralloc_free(config->register_config);
@@ -276,7 +266,7 @@ VkResult anv_QueueSetPerformanceConfigurationINTEL(
ANV_FROM_HANDLE(anv_performance_configuration_intel, config, _configuration);
struct anv_device *device = queue->device;
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
if (device->perf_fd < 0) {
device->perf_fd = anv_device_perf_open(device, config->config_id);
if (device->perf_fd < 0)
@@ -285,7 +275,7 @@ VkResult anv_QueueSetPerformanceConfigurationINTEL(
int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
(void *)(uintptr_t) config->config_id);
if (ret < 0)
- return anv_device_set_lost(device, "i915-perf config failed: %m");
+ return vk_device_set_lost(&device->vk, "i915-perf config failed: %m");
}
}
@@ -346,15 +336,25 @@ VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
uint32_t desc_count = *pCounterCount;
- VK_OUTARRAY_MAKE(out, pCounters, pCounterCount);
- VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR, out, pCounters, pCounterCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR, out_desc,
+ pCounterDescriptions, &desc_count);
+
+ /* We cannot support performance queries on anything other than RCS,
+ * because the MI_REPORT_PERF_COUNT command is not available on other
+ * engines.
+ */
+ struct anv_queue_family *queue_family =
+ &pdevice->queue.families[queueFamilyIndex];
+ if (queue_family->engine_class != INTEL_ENGINE_CLASS_RENDER)
+ return vk_outarray_status(&out);
for (int c = 0; c < (perf ? perf->n_counters : 0); c++) {
const struct intel_perf_query_counter *intel_counter = perf->counter_infos[c].counter;
- vk_outarray_append(&out, counter) {
+ vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
counter->unit = intel_perf_counter_unit_to_vk_unit[intel_counter->units];
- counter->scope = VK_QUERY_SCOPE_COMMAND_KHR;
+ counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
counter->storage = intel_perf_counter_data_type_to_vk_storage[intel_counter->data_type];
unsigned char sha1_result[20];
@@ -364,9 +364,12 @@ VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
}
- vk_outarray_append(&out_desc, desc) {
+ vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR, &out_desc, desc) {
desc->flags = 0; /* None so far. */
- snprintf(desc->name, sizeof(desc->name), "%s", intel_counter->name);
+ snprintf(desc->name, sizeof(desc->name), "%s",
+ INTEL_DEBUG(DEBUG_PERF_SYMBOL_NAMES) ?
+ intel_counter->symbol_name :
+ intel_counter->name);
snprintf(desc->category, sizeof(desc->category), "%s", intel_counter->category);
snprintf(desc->description, sizeof(desc->description), "%s", intel_counter->desc);
}
@@ -405,7 +408,7 @@ VkResult anv_AcquireProfilingLockKHR(
assert(device->perf_fd == -1);
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
fd = anv_device_perf_open(device, first_metric_set->oa_metrics_set_id);
if (fd < 0)
return VK_TIMEOUT;
@@ -420,7 +423,7 @@ void anv_ReleaseProfilingLockKHR(
{
ANV_FROM_HANDLE(anv_device, device, _device);
- if (!(INTEL_DEBUG & DEBUG_NO_OACONFIG)) {
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) {
assert(device->perf_fd >= 0);
close(device->perf_fd);
}
@@ -433,10 +436,12 @@ anv_perf_write_pass_results(struct intel_perf_config *perf,
const struct intel_perf_query_result *accumulated_results,
union VkPerformanceCounterResultKHR *results)
{
+ const struct intel_perf_query_info *query = pool->pass_query[pass];
+
for (uint32_t c = 0; c < pool->n_counters; c++) {
const struct intel_perf_counter_pass *counter_pass = &pool->counter_pass[c];
- if (counter_pass->pass != pass)
+ if (counter_pass->query != query)
continue;
switch (pool->pass_query[pass]->kind) {
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 864c9733224..6d417fda354 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -30,80 +30,122 @@
#include "util/mesa-sha1.h"
#include "util/os_time.h"
#include "common/intel_l3_config.h"
-#include "common/intel_disasm.h"
#include "common/intel_sample_positions.h"
+#include "compiler/brw_disasm.h"
#include "anv_private.h"
#include "compiler/brw_nir.h"
#include "compiler/brw_nir_rt.h"
+#include "compiler/intel_nir.h"
#include "anv_nir.h"
#include "nir/nir_xfb_info.h"
#include "spirv/nir_spirv.h"
+#include "vk_nir_convert_ycbcr.h"
+#include "vk_nir.h"
+#include "vk_pipeline.h"
+#include "vk_render_pass.h"
#include "vk_util.h"
-/* Needed for SWIZZLE macros */
-#include "program/prog_instruction.h"
+struct lower_set_vtx_and_prim_count_state {
+ nir_variable *primitive_count;
+};
-// Shader functions
-#define SPIR_V_MAGIC_NUMBER 0x07230203
+static nir_variable *
+anv_nir_prim_count_store(nir_builder *b, nir_def *val)
+{
+ nir_variable *primitive_count =
+ nir_variable_create(b->shader,
+ nir_var_shader_out,
+ glsl_uint_type(),
+ "gl_PrimitiveCountNV");
+ primitive_count->data.location = VARYING_SLOT_PRIMITIVE_COUNT;
+ primitive_count->data.interpolation = INTERP_MODE_NONE;
+
+ nir_def *local_invocation_index = nir_load_local_invocation_index(b);
+
+ nir_def *cmp = nir_ieq_imm(b, local_invocation_index, 0);
+ nir_if *if_stmt = nir_push_if(b, cmp);
+ {
+ nir_deref_instr *prim_count_deref = nir_build_deref_var(b, primitive_count);
+ nir_store_deref(b, prim_count_deref, val, 1);
+ }
+ nir_pop_if(b, if_stmt);
-struct anv_spirv_debug_data {
- struct anv_device *device;
- const struct vk_shader_module *module;
-};
+ return primitive_count;
+}
-static void anv_spirv_nir_debug(void *private_data,
- enum nir_spirv_debug_level level,
- size_t spirv_offset,
- const char *message)
+static bool
+anv_nir_lower_set_vtx_and_prim_count_instr(nir_builder *b,
+ nir_intrinsic_instr *intrin,
+ void *data)
{
- struct anv_spirv_debug_data *debug_data = private_data;
- struct anv_instance *instance = debug_data->device->physical->instance;
+ if (intrin->intrinsic != nir_intrinsic_set_vertex_and_primitive_count)
+ return false;
- static const VkDebugReportFlagsEXT vk_flags[] = {
- [NIR_SPIRV_DEBUG_LEVEL_INFO] = VK_DEBUG_REPORT_INFORMATION_BIT_EXT,
- [NIR_SPIRV_DEBUG_LEVEL_WARNING] = VK_DEBUG_REPORT_WARNING_BIT_EXT,
- [NIR_SPIRV_DEBUG_LEVEL_ERROR] = VK_DEBUG_REPORT_ERROR_BIT_EXT,
- };
- char buffer[256];
+ /* Detect some cases of invalid primitive count. They might lead to URB
+ * memory corruption, where workgroups overwrite each other's output memory.
+ */
+ if (nir_src_is_const(intrin->src[1]) &&
+ nir_src_as_uint(intrin->src[1]) > b->shader->info.mesh.max_primitives_out) {
+ assert(!"number of primitives bigger than max specified");
+ }
+
+ struct lower_set_vtx_and_prim_count_state *state = data;
+ /* This intrinsic should show up only once. */
+ assert(state->primitive_count == NULL);
+
+ b->cursor = nir_before_instr(&intrin->instr);
+
+ state->primitive_count = anv_nir_prim_count_store(b, intrin->src[1].ssa);
- snprintf(buffer, sizeof(buffer), "SPIR-V offset %lu: %s", (unsigned long) spirv_offset, message);
+ nir_instr_remove(&intrin->instr);
- vk_debug_report(&instance->vk, vk_flags[level],
- &debug_data->module->base,
- 0, 0, "anv", buffer);
+ return true;
+}
+
+static bool
+anv_nir_lower_set_vtx_and_prim_count(nir_shader *nir)
+{
+ struct lower_set_vtx_and_prim_count_state state = { NULL, };
+
+ nir_shader_intrinsics_pass(nir, anv_nir_lower_set_vtx_and_prim_count_instr,
+ nir_metadata_none,
+ &state);
+
+ /* If we didn't find set_vertex_and_primitive_count, then we have to
+ * insert a store of value 0 to primitive_count.
+ */
+ if (state.primitive_count == NULL) {
+ nir_builder b;
+ nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
+ b = nir_builder_at(nir_before_impl(entrypoint));
+ nir_def *zero = nir_imm_int(&b, 0);
+ state.primitive_count = anv_nir_prim_count_store(&b, zero);
+ }
+
+ assert(state.primitive_count != NULL);
+ return true;
}
/* Eventually, this will become part of anv_CreateShader. Unfortunately,
* we can't do that yet because we don't have the ability to copy nir.
*/
static nir_shader *
-anv_shader_compile_to_nir(struct anv_device *device,
- void *mem_ctx,
- const struct vk_shader_module *module,
- const char *entrypoint_name,
- gl_shader_stage stage,
- const VkSpecializationInfo *spec_info)
+anv_shader_stage_to_nir(struct anv_device *device,
+ const VkPipelineShaderStageCreateInfo *stage_info,
+ enum brw_robustness_flags robust_flags,
+ void *mem_ctx)
{
const struct anv_physical_device *pdevice = device->physical;
const struct brw_compiler *compiler = pdevice->compiler;
+ gl_shader_stage stage = vk_to_mesa_shader_stage(stage_info->stage);
const nir_shader_compiler_options *nir_options =
- compiler->glsl_compiler_options[stage].NirOptions;
+ compiler->nir_options[stage];
- uint32_t *spirv = (uint32_t *) module->data;
- assert(spirv[0] == SPIR_V_MAGIC_NUMBER);
- assert(module->size % 4 == 0);
-
- uint32_t num_spec_entries = 0;
- struct nir_spirv_specialization *spec_entries =
- vk_spec_info_to_nir_spirv(spec_info, &num_spec_entries);
-
- struct anv_spirv_debug_data spirv_debug_data = {
- .device = device,
- .module = module,
- };
- struct spirv_to_nir_options spirv_options = {
- .frag_coord_is_sysval = true,
+ const bool rt_enabled = ANV_SUPPORT_RT && pdevice->info.has_ray_tracing;
+ const struct spirv_to_nir_options spirv_options = {
.caps = {
+ .amd_image_gather_bias_lod = pdevice->info.ver >= 20,
+ .cooperative_matrix = anv_has_cooperative_matrix(pdevice),
.demote_to_helper_invocation = true,
.derivative_group = true,
.descriptor_array_dynamic_indexing = true,
@@ -111,51 +153,60 @@ anv_shader_compile_to_nir(struct anv_device *device,
.descriptor_indexing = true,
.device_group = true,
.draw_parameters = true,
- .float16 = pdevice->info.ver >= 8,
+ .float16 = true,
.float32_atomic_add = pdevice->info.has_lsc,
- .float32_atomic_min_max = pdevice->info.ver >= 9,
- .float64 = pdevice->info.ver >= 8,
+ .float32_atomic_min_max = true,
+ .float64 = true,
.float64_atomic_min_max = pdevice->info.has_lsc,
- .fragment_shader_sample_interlock = pdevice->info.ver >= 9,
- .fragment_shader_pixel_interlock = pdevice->info.ver >= 9,
+ .fragment_shader_sample_interlock = true,
+ .fragment_shader_pixel_interlock = true,
.geometry_streams = true,
+ .image_read_without_format = true,
.image_write_without_format = true,
- .int8 = pdevice->info.ver >= 8,
- .int16 = pdevice->info.ver >= 8,
- .int64 = pdevice->info.ver >= 8,
- .int64_atomics = pdevice->info.ver >= 9 && pdevice->use_softpin,
- .integer_functions2 = pdevice->info.ver >= 8,
+ .int8 = true,
+ .int16 = true,
+ .int64 = true,
+ .int64_atomics = true,
+ .integer_functions2 = true,
+ .mesh_shading = pdevice->vk.supported_extensions.EXT_mesh_shader,
+ .mesh_shading_nv = false,
.min_lod = true,
.multiview = true,
- .physical_storage_buffer_address = pdevice->has_a64_buffer_access,
- .post_depth_coverage = pdevice->info.ver >= 9,
+ .physical_storage_buffer_address = true,
+ .post_depth_coverage = true,
+ .quad_control = true,
.runtime_descriptor_array = true,
- .float_controls = pdevice->info.ver >= 8,
- .ray_tracing = pdevice->info.has_ray_tracing,
+ .float_controls = true,
+ .float_controls2 = true,
+ .ray_cull_mask = rt_enabled,
+ .ray_query = rt_enabled,
+ .ray_tracing = rt_enabled,
+ .ray_tracing_position_fetch = rt_enabled,
.shader_clock = true,
.shader_viewport_index_layer = true,
- .stencil_export = pdevice->info.ver >= 9,
- .storage_8bit = pdevice->info.ver >= 8,
- .storage_16bit = pdevice->info.ver >= 8,
+ .sparse_residency = pdevice->sparse_type != ANV_SPARSE_TYPE_NOT_SUPPORTED,
+ .stencil_export = true,
+ .storage_8bit = true,
+ .storage_16bit = true,
.subgroup_arithmetic = true,
.subgroup_basic = true,
.subgroup_ballot = true,
+ .subgroup_dispatch = true,
.subgroup_quad = true,
+ .subgroup_rotate = true,
.subgroup_uniform_control_flow = true,
.subgroup_shuffle = true,
.subgroup_vote = true,
.tessellation = true,
- .transform_feedback = pdevice->info.ver >= 8,
+ .transform_feedback = true,
.variable_pointers = true,
.vk_memory_model = true,
.vk_memory_model_device_scope = true,
.workgroup_memory_explicit_layout = true,
.fragment_shading_rate = pdevice->info.ver >= 11,
},
- .ubo_addr_format =
- anv_nir_ubo_addr_format(pdevice, device->robust_buffer_access),
- .ssbo_addr_format =
- anv_nir_ssbo_addr_format(pdevice, device->robust_buffer_access),
+ .ubo_addr_format = anv_nir_ubo_addr_format(pdevice, robust_flags),
+ .ssbo_addr_format = anv_nir_ssbo_addr_format(pdevice, robust_flags),
.phys_ssbo_addr_format = nir_address_format_64bit_global,
.push_const_addr_format = nir_address_format_logical,
@@ -164,89 +215,36 @@ anv_shader_compile_to_nir(struct anv_device *device,
* with certain code / code generators.
*/
.shared_addr_format = nir_address_format_32bit_offset,
- .debug = {
- .func = anv_spirv_nir_debug,
- .private_data = &spirv_debug_data,
- },
- };
+ .min_ubo_alignment = ANV_UBO_ALIGNMENT,
+ .min_ssbo_alignment = ANV_SSBO_ALIGNMENT,
+ };
- nir_shader *nir =
- spirv_to_nir(spirv, module->size / 4,
- spec_entries, num_spec_entries,
- stage, entrypoint_name, &spirv_options, nir_options);
- if (!nir) {
- free(spec_entries);
+ nir_shader *nir;
+ VkResult result =
+ vk_pipeline_shader_stage_to_nir(&device->vk, stage_info,
+ &spirv_options, nir_options,
+ mem_ctx, &nir);
+ if (result != VK_SUCCESS)
return NULL;
- }
-
- assert(nir->info.stage == stage);
- nir_validate_shader(nir, "after spirv_to_nir");
- nir_validate_ssa_dominance(nir, "after spirv_to_nir");
- ralloc_steal(mem_ctx, nir);
-
- free(spec_entries);
- if (INTEL_DEBUG & intel_debug_flag_for_shader_stage(stage)) {
+ if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) {
fprintf(stderr, "NIR (from SPIR-V) for %s shader:\n",
gl_shader_stage_name(stage));
nir_print_shader(nir, stderr);
}
- /* We have to lower away local constant initializers right before we
- * inline functions. That way they get properly initialized at the top
- * of the function and not at the top of its caller.
- */
- NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
- NIR_PASS_V(nir, nir_lower_returns);
- NIR_PASS_V(nir, nir_inline_functions);
- NIR_PASS_V(nir, nir_copy_prop);
- NIR_PASS_V(nir, nir_opt_deref);
-
- /* Pick off the single entrypoint that we want */
- foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
- if (!func->is_entrypoint)
- exec_node_remove(&func->node);
- }
- assert(exec_list_length(&nir->functions) == 1);
-
- /* Now that we've deleted all but the main function, we can go ahead and
- * lower the rest of the constant initializers. We do this here so that
- * nir_remove_dead_variables and split_per_member_structs below see the
- * corresponding stores.
- */
- NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);
-
- /* Split member structs. We do this before lower_io_to_temporaries so that
- * it doesn't lower system values to temporaries by accident.
- */
- NIR_PASS_V(nir, nir_split_var_copies);
- NIR_PASS_V(nir, nir_split_per_member_structs);
-
- NIR_PASS_V(nir, nir_remove_dead_variables,
- nir_var_shader_in | nir_var_shader_out | nir_var_system_value |
- nir_var_shader_call_data | nir_var_ray_hit_attrib,
- NULL);
-
- NIR_PASS_V(nir, nir_propagate_invariant, false);
NIR_PASS_V(nir, nir_lower_io_to_temporaries,
nir_shader_get_entrypoint(nir), true, false);
- NIR_PASS_V(nir, nir_lower_frexp);
-
- /* Vulkan uses the separate-shader linking model */
- nir->info.separate_shader = true;
-
- brw_preprocess_nir(compiler, nir, NULL);
-
return nir;
}
-VkResult
+static VkResult
anv_pipeline_init(struct anv_pipeline *pipeline,
struct anv_device *device,
enum anv_pipeline_type type,
- VkPipelineCreateFlags flags,
+ VkPipelineCreateFlags2KHR flags,
const VkAllocationCallbacks *pAllocator)
{
VkResult result;
@@ -264,8 +262,9 @@ anv_pipeline_init(struct anv_pipeline *pipeline,
pipeline->batch.relocs = &pipeline->batch_relocs;
pipeline->batch.status = VK_SUCCESS;
+ const bool uses_relocs = device->physical->uses_relocs;
result = anv_reloc_list_init(&pipeline->batch_relocs,
- pipeline->batch.alloc);
+ pipeline->batch.alloc, uses_relocs);
if (result != VK_SUCCESS)
return result;
@@ -276,16 +275,40 @@ anv_pipeline_init(struct anv_pipeline *pipeline,
util_dynarray_init(&pipeline->executables, pipeline->mem_ctx);
+ anv_pipeline_sets_layout_init(&pipeline->layout, device,
+ false /* independent_sets */);
+
return VK_SUCCESS;
}
-void
+static void
+anv_pipeline_init_layout(struct anv_pipeline *pipeline,
+ struct anv_pipeline_layout *pipeline_layout)
+{
+ if (pipeline_layout) {
+ struct anv_pipeline_sets_layout *layout = &pipeline_layout->sets_layout;
+ for (uint32_t s = 0; s < layout->num_sets; s++) {
+ if (layout->set[s].layout == NULL)
+ continue;
+
+ anv_pipeline_sets_layout_add(&pipeline->layout, s,
+ layout->set[s].layout);
+ }
+ }
+
+ anv_pipeline_sets_layout_hash(&pipeline->layout);
+ assert(!pipeline_layout ||
+ !memcmp(pipeline->layout.sha1,
+ pipeline_layout->sets_layout.sha1,
+ sizeof(pipeline_layout->sets_layout.sha1)));
+}
+
+static void
anv_pipeline_finish(struct anv_pipeline *pipeline,
- struct anv_device *device,
- const VkAllocationCallbacks *pAllocator)
+ struct anv_device *device)
{
- anv_reloc_list_finish(&pipeline->batch_relocs,
- pAllocator ? pAllocator : &device->vk.alloc);
+ anv_pipeline_sets_layout_fini(&pipeline->layout);
+ anv_reloc_list_finish(&pipeline->batch_relocs);
ralloc_free(pipeline->mem_ctx);
vk_object_base_finish(&pipeline->base);
}
@@ -301,19 +324,27 @@ void anv_DestroyPipeline(
if (!pipeline)
return;
+ ANV_RMV(resource_destroy, device, pipeline);
+
switch (pipeline->type) {
+ case ANV_PIPELINE_GRAPHICS_LIB: {
+ struct anv_graphics_lib_pipeline *gfx_pipeline =
+ anv_pipeline_to_graphics_lib(pipeline);
+
+ for (unsigned s = 0; s < ARRAY_SIZE(gfx_pipeline->base.shaders); s++) {
+ if (gfx_pipeline->base.shaders[s])
+ anv_shader_bin_unref(device, gfx_pipeline->base.shaders[s]);
+ }
+ break;
+ }
+
case ANV_PIPELINE_GRAPHICS: {
struct anv_graphics_pipeline *gfx_pipeline =
anv_pipeline_to_graphics(pipeline);
- if (gfx_pipeline->blend_state.map)
- anv_state_pool_free(&device->dynamic_state_pool, gfx_pipeline->blend_state);
- if (gfx_pipeline->cps_state.map)
- anv_state_pool_free(&device->dynamic_state_pool, gfx_pipeline->cps_state);
-
- for (unsigned s = 0; s < ARRAY_SIZE(gfx_pipeline->shaders); s++) {
- if (gfx_pipeline->shaders[s])
- anv_shader_bin_unref(device, gfx_pipeline->shaders[s]);
+ for (unsigned s = 0; s < ARRAY_SIZE(gfx_pipeline->base.shaders); s++) {
+ if (gfx_pipeline->base.shaders[s])
+ anv_shader_bin_unref(device, gfx_pipeline->base.shaders[s]);
}
break;
}
@@ -343,358 +374,436 @@ void anv_DestroyPipeline(
unreachable("invalid pipeline type");
}
- anv_pipeline_finish(pipeline, device, pAllocator);
+ anv_pipeline_finish(pipeline, device);
vk_free2(&device->vk.alloc, pAllocator, pipeline);
}
-static const uint32_t vk_to_intel_primitive_type[] = {
- [VK_PRIMITIVE_TOPOLOGY_POINT_LIST] = _3DPRIM_POINTLIST,
- [VK_PRIMITIVE_TOPOLOGY_LINE_LIST] = _3DPRIM_LINELIST,
- [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP] = _3DPRIM_LINESTRIP,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST] = _3DPRIM_TRILIST,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
- [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
- [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
- [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
-};
+struct anv_pipeline_stage {
+ gl_shader_stage stage;
-static void
-populate_sampler_prog_key(const struct intel_device_info *devinfo,
- struct brw_sampler_prog_key_data *key)
-{
- /* Almost all multisampled textures are compressed. The only time when we
- * don't compress a multisampled texture is for 16x MSAA with a surface
- * width greater than 8k which is a bit of an edge case. Since the sampler
- * just ignores the MCS parameter to ld2ms when MCS is disabled, it's safe
- * to tell the compiler to always assume compression.
- */
- key->compressed_multisample_layout_mask = ~0;
-
- /* SkyLake added support for 16x MSAA. With this came a new message for
- * reading from a 16x MSAA surface with compression. The new message was
- * needed because now the MCS data is 64 bits instead of 32 or lower as is
- * the case for 8x, 4x, and 2x. The key->msaa_16 bit-field controls which
- * message we use. Fortunately, the 16x message works for 8x, 4x, and 2x
- * so we can just use it unconditionally. This may not be quite as
- * efficient but it saves us from recompiling.
+ struct vk_pipeline_robustness_state rstate;
+
+ /* VkComputePipelineCreateInfo, VkGraphicsPipelineCreateInfo or
+ * VkRayTracingPipelineCreateInfoKHR pNext field
*/
- if (devinfo->ver >= 9)
- key->msaa_16 = ~0;
+ const void *pipeline_pNext;
+ const VkPipelineShaderStageCreateInfo *info;
- /* XXX: Handle texture swizzle on HSW- */
- for (int i = 0; i < MAX_SAMPLERS; i++) {
- /* Assume color sampler, no swizzling. (Works for BDW+) */
- key->swizzles[i] = SWIZZLE_XYZW;
- }
-}
+ unsigned char shader_sha1[20];
+ uint32_t source_hash;
+
+ union brw_any_prog_key key;
+
+ struct {
+ gl_shader_stage stage;
+ unsigned char sha1[20];
+ } cache_key;
+
+ nir_shader *nir;
+
+ struct {
+ nir_shader *nir;
+ struct anv_shader_bin *bin;
+ } imported;
+
+ struct anv_push_descriptor_info push_desc_info;
+
+ enum gl_subgroup_size subgroup_size_type;
+
+ enum brw_robustness_flags robust_flags;
+
+ struct anv_pipeline_bind_map bind_map;
+
+ bool uses_bt_for_push_descs;
+
+ enum anv_dynamic_push_bits dynamic_push_values;
+
+ union brw_any_prog_data prog_data;
+
+ uint32_t num_stats;
+ struct brw_compile_stats stats[3];
+ char *disasm[3];
+
+ VkPipelineCreationFeedback feedback;
+ uint32_t feedback_idx;
+
+ const unsigned *code;
+
+ struct anv_shader_bin *bin;
+};
static void
-populate_base_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- struct brw_base_prog_key *key)
+anv_stage_allocate_bind_map_tables(struct anv_pipeline *pipeline,
+ struct anv_pipeline_stage *stage,
+ void *mem_ctx)
{
- if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT)
- key->subgroup_size_type = BRW_SUBGROUP_SIZE_VARYING;
- else
- key->subgroup_size_type = BRW_SUBGROUP_SIZE_API_CONSTANT;
-
- key->robust_buffer_access = robust_buffer_acccess;
+ struct anv_pipeline_binding *surface_bindings =
+ brw_shader_stage_requires_bindless_resources(stage->stage) ? NULL :
+ rzalloc_array(mem_ctx, struct anv_pipeline_binding, 256);
+ struct anv_pipeline_binding *sampler_bindings =
+ brw_shader_stage_requires_bindless_resources(stage->stage) ? NULL :
+ rzalloc_array(mem_ctx, struct anv_pipeline_binding, 256);
+ struct anv_pipeline_embedded_sampler_binding *embedded_sampler_bindings =
+ rzalloc_array(mem_ctx, struct anv_pipeline_embedded_sampler_binding,
+ anv_pipeline_sets_layout_embedded_sampler_count(
+ &pipeline->layout));
+
+ stage->bind_map = (struct anv_pipeline_bind_map) {
+ .surface_to_descriptor = surface_bindings,
+ .sampler_to_descriptor = sampler_bindings,
+ .embedded_sampler_to_binding = embedded_sampler_bindings,
+ };
+}
- populate_sampler_prog_key(devinfo, &key->tex);
+static enum brw_robustness_flags
+anv_get_robust_flags(const struct vk_pipeline_robustness_state *rstate)
+{
+ return
+ ((rstate->storage_buffers !=
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT) ?
+ BRW_ROBUSTNESS_SSBO : 0) |
+ ((rstate->uniform_buffers !=
+ VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT) ?
+ BRW_ROBUSTNESS_UBO : 0);
}
static void
-populate_vs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- struct brw_vs_prog_key *key)
+populate_base_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
{
- memset(key, 0, sizeof(*key));
-
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ stage->key.base.robust_flags = anv_get_robust_flags(&stage->rstate);
+ stage->key.base.limit_trig_input_range =
+ device->physical->instance->limit_trig_input_range;
+}
- /* XXX: Handle vertex input work-arounds */
+static void
+populate_vs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
+{
+ memset(&stage->key, 0, sizeof(stage->key));
- /* XXX: Handle sampler_prog_key */
+ populate_base_prog_key(stage, device);
}
static void
-populate_tcs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- unsigned input_vertices,
- struct brw_tcs_prog_key *key)
+populate_tcs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device,
+ unsigned input_vertices)
{
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ populate_base_prog_key(stage, device);
- key->input_vertices = input_vertices;
+ stage->key.tcs.input_vertices = input_vertices;
}
static void
-populate_tes_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- struct brw_tes_prog_key *key)
+populate_tes_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
{
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ populate_base_prog_key(stage, device);
}
static void
-populate_gs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- struct brw_gs_prog_key *key)
+populate_gs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
{
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ populate_base_prog_key(stage, device);
}
static bool
-pipeline_has_coarse_pixel(const struct anv_graphics_pipeline *pipeline,
- const VkPipelineFragmentShadingRateStateCreateInfoKHR *fsr_info)
+pipeline_has_coarse_pixel(const BITSET_WORD *dynamic,
+ const struct vk_multisample_state *ms,
+ const struct vk_fragment_shading_rate_state *fsr)
{
- if (pipeline->sample_shading_enable)
- return false;
-
- /* Not dynamic & not specified for the pipeline. */
- if ((pipeline->dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) == 0 && !fsr_info)
+ /* The Vulkan 1.2.199 spec says:
+ *
+ * "If any of the following conditions are met, Cxy' must be set to
+ * {1,1}:
+ *
+ * * If Sample Shading is enabled.
+ * * [...]"
+ *
+ * And "sample shading" is defined as follows:
+ *
+ * "Sample shading is enabled for a graphics pipeline:
+ *
+ * * If the interface of the fragment shader entry point of the
+ * graphics pipeline includes an input variable decorated with
+ * SampleId or SamplePosition. In this case minSampleShadingFactor
+ * takes the value 1.0.
+ *
+ * * Else if the sampleShadingEnable member of the
+ * VkPipelineMultisampleStateCreateInfo structure specified when
+ * creating the graphics pipeline is set to VK_TRUE. In this case
+ * minSampleShadingFactor takes the value of
+ * VkPipelineMultisampleStateCreateInfo::minSampleShading.
+ *
+ * Otherwise, sample shading is considered disabled."
+ *
+ * The first bullet above is handled by the back-end compiler because those
+ * inputs both force per-sample dispatch. The second bullet is handled
+ * here. Note that whether sample shading is enabled has nothing to do
+ * with minSampleShading.
+ */
+ if (ms != NULL && ms->sample_shading_enable)
return false;
/* Not dynamic & pipeline has a 1x1 fragment shading rate with no
- * possibility for element of the pipeline to change the value.
+ * possibility for any element of the pipeline to change the value, or the
+ * fragment shading rate is not specified at all.
*/
- if ((pipeline->dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) == 0 &&
- fsr_info->fragmentSize.width <= 1 &&
- fsr_info->fragmentSize.height <= 1 &&
- fsr_info->combinerOps[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
- fsr_info->combinerOps[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)
+ if (!BITSET_TEST(dynamic, MESA_VK_DYNAMIC_FSR) &&
+ (fsr == NULL ||
+ (fsr->fragment_size.width <= 1 &&
+ fsr->fragment_size.height <= 1 &&
+ fsr->combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
+ fsr->combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)))
return false;
return true;
}
static void
-populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- const struct anv_subpass *subpass,
- const VkPipelineMultisampleStateCreateInfo *ms_info,
- const VkPipelineFragmentShadingRateStateCreateInfoKHR *fsr_info,
- struct brw_wm_prog_key *key)
+populate_task_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
+{
+ memset(&stage->key, 0, sizeof(stage->key));
+
+ populate_base_prog_key(stage, device);
+}
+
+static void
+populate_mesh_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device,
+ bool compact_mue)
+{
+ memset(&stage->key, 0, sizeof(stage->key));
+
+ populate_base_prog_key(stage, device);
+
+ stage->key.mesh.compact_mue = compact_mue;
+}
+
+static uint32_t
+rp_color_mask(const struct vk_render_pass_state *rp)
+{
+ if (rp == NULL || !vk_render_pass_state_has_attachment_info(rp))
+ return ((1u << MAX_RTS) - 1);
+
+ uint32_t color_mask = 0;
+ for (uint32_t i = 0; i < rp->color_attachment_count; i++) {
+ if (rp->color_attachment_formats[i] != VK_FORMAT_UNDEFINED)
+ color_mask |= BITFIELD_BIT(i);
+ }
+
+ return color_mask;
+}
+
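
Editor's note: a small worked example of rp_color_mask() above, with illustrative values. With three color attachments where attachment 1 has VK_FORMAT_UNDEFINED, the mask keeps a hole for the unused slot, which is why the nr_color_regions computation further down uses util_last_bit() rather than a popcount.

#include <assert.h>
#include <stdint.h>

int main(void)
{
   /* 0 stands in for VK_FORMAT_UNDEFINED, non-zero for any real format. */
   const int formats[] = { 1, 0, 1 };
   uint32_t mask = 0;

   for (uint32_t i = 0; i < 3; i++) {
      if (formats[i] != 0)
         mask |= 1u << i;
   }

   assert(mask == 0x5);   /* 0b101: attachments 0 and 2 are written */
   /* util_last_bit(0x5) == 3, so all three render target slots are kept. */
   return 0;
}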
+static void
+populate_wm_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_graphics_base_pipeline *pipeline,
+ const BITSET_WORD *dynamic,
+ const struct vk_multisample_state *ms,
+ const struct vk_fragment_shading_rate_state *fsr,
+ const struct vk_render_pass_state *rp,
+ const enum brw_sometimes is_mesh)
{
const struct anv_device *device = pipeline->base.device;
- const struct intel_device_info *devinfo = &device->info;
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
+
+ populate_base_prog_key(stage, device);
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
+ struct brw_wm_prog_key *key = &stage->key.wm;
 /* We set this to 0 here and set it to the actual value before we call
* brw_compile_fs.
*/
key->input_slots_valid = 0;
- /* Vulkan doesn't specify a default */
- key->high_quality_derivatives = false;
-
/* XXX Vulkan doesn't appear to specify */
key->clamp_fragment_color = false;
key->ignore_sample_mask_out = false;
- assert(subpass->color_count <= MAX_RTS);
- for (uint32_t i = 0; i < subpass->color_count; i++) {
- if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
- key->color_outputs_valid |= (1 << i);
- }
-
- key->nr_color_regions = subpass->color_count;
+ assert(rp == NULL || rp->color_attachment_count <= MAX_RTS);
+ /* Consider all inputs as valid until we look at the NIR variables. */
+ key->color_outputs_valid = rp_color_mask(rp);
+ key->nr_color_regions = util_last_bit(key->color_outputs_valid);
/* To reduce possible shader recompilations we would need to know if
* there is a SampleMask output variable to compute if we should emit
* code to workaround the issue that hardware disables alpha to coverage
* when there is SampleMask output.
+ *
+ * If the pipeline we compile the fragment shader in includes the output
+ * interface, then we can be sure whether alpha_coverage is enabled or not.
+ * If we don't have that output interface, then we have to compile the
+ * shader with some conditionals.
*/
- key->alpha_to_coverage = ms_info && ms_info->alphaToCoverageEnable;
-
- /* Vulkan doesn't support fixed-function alpha test */
- key->alpha_test_replicate_alpha = false;
-
- if (ms_info) {
- /* We should probably pull this out of the shader, but it's fairly
- * harmless to compute it and then let dead-code take care of it.
+ if (ms != NULL) {
+ /* VUID-VkGraphicsPipelineCreateInfo-rasterizerDiscardEnable-00751:
+ *
+ * "If the pipeline is being created with fragment shader state,
+ * pMultisampleState must be a valid pointer to a valid
+ * VkPipelineMultisampleStateCreateInfo structure"
+ *
+ * It's also required for the fragment output interface.
*/
- if (ms_info->rasterizationSamples > 1) {
- key->persample_interp = ms_info->sampleShadingEnable &&
- (ms_info->minSampleShading * ms_info->rasterizationSamples) > 1;
- key->multisample_fbo = true;
- }
+ key->multisample_fbo =
+ BITSET_TEST(dynamic, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ?
+ BRW_SOMETIMES :
+ ms->rasterization_samples > 1 ? BRW_ALWAYS : BRW_NEVER;
+ key->persample_interp =
+ BITSET_TEST(dynamic, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ?
+ BRW_SOMETIMES :
+ (ms->sample_shading_enable &&
+ (ms->min_sample_shading * ms->rasterization_samples) > 1) ?
+ BRW_ALWAYS : BRW_NEVER;
+ key->alpha_to_coverage =
+ BITSET_TEST(dynamic, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ?
+ BRW_SOMETIMES :
+ (ms->alpha_to_coverage_enable ? BRW_ALWAYS : BRW_NEVER);
+
+ /* TODO: We should make this dynamic */
+ if (device->physical->instance->sample_mask_out_opengl_behaviour)
+ key->ignore_sample_mask_out = !key->multisample_fbo;
+ } else {
+ /* Consider all inputs as valid until we look at the NIR variables. */
+ key->color_outputs_valid = (1u << MAX_RTS) - 1;
+ key->nr_color_regions = MAX_RTS;
- key->frag_coord_adds_sample_pos = key->persample_interp;
+ key->alpha_to_coverage = BRW_SOMETIMES;
+ key->multisample_fbo = BRW_SOMETIMES;
+ key->persample_interp = BRW_SOMETIMES;
}
- key->coarse_pixel =
- device->vk.enabled_extensions.KHR_fragment_shading_rate &&
- pipeline_has_coarse_pixel(pipeline, fsr_info);
-}
-
-static void
-populate_cs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_acccess,
- const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *rss_info,
- struct brw_cs_prog_key *key)
-{
- memset(key, 0, sizeof(*key));
+ key->mesh_input = is_mesh;
- populate_base_prog_key(devinfo, flags, robust_buffer_acccess, &key->base);
-
- if (rss_info) {
- assert(key->base.subgroup_size_type != BRW_SUBGROUP_SIZE_VARYING);
+ /* Vulkan doesn't support fixed-function alpha test */
+ key->alpha_test_replicate_alpha = false;
- /* These enum values are expressly chosen to be equal to the subgroup
- * size that they require.
- */
- assert(rss_info->requiredSubgroupSize == 8 ||
- rss_info->requiredSubgroupSize == 16 ||
- rss_info->requiredSubgroupSize == 32);
- key->base.subgroup_size_type = rss_info->requiredSubgroupSize;
- } else if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) {
- /* If the client expressly requests full subgroups and they don't
- * specify a subgroup size, we need to pick one. If they're requested
- * varying subgroup sizes, we set it to UNIFORM and let the back-end
- * compiler pick. Otherwise, we specify the API value of 32.
- * Performance will likely be terrible in this case but there's nothing
- * we can do about that. The client should have chosen a size.
- */
- if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT)
- key->base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM;
- else
- key->base.subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_32;
- }
+ key->coarse_pixel =
+ device->vk.enabled_extensions.KHR_fragment_shading_rate &&
+ pipeline_has_coarse_pixel(dynamic, ms, fsr);
}
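
As an illustration of the never/sometimes/always selection used above, here is a minimal standalone sketch; the enum and the tristate_from_state() helper are stand-ins for brw_sometimes and the inline logic in populate_wm_prog_key(), not driver code.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch, not part of this patch: when a piece of state is
 * dynamic, the key cannot bake in a value and the compiler must handle both
 * cases (SOMETIMES); otherwise the static value is recorded as ALWAYS/NEVER.
 */
enum tristate { TS_NEVER, TS_SOMETIMES, TS_ALWAYS };

static enum tristate
tristate_from_state(bool is_dynamic, bool static_value)
{
   if (is_dynamic)
      return TS_SOMETIMES;
   return static_value ? TS_ALWAYS : TS_NEVER;
}

int main(void)
{
   printf("%d\n", tristate_from_state(false, true));   /* TS_ALWAYS */
   printf("%d\n", tristate_from_state(true, false));   /* TS_SOMETIMES */
   return 0;
}
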
static void
-populate_bs_prog_key(const struct intel_device_info *devinfo,
- VkPipelineShaderStageCreateFlags flags,
- bool robust_buffer_access,
- struct brw_bs_prog_key *key)
+populate_cs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
{
- memset(key, 0, sizeof(*key));
+ memset(&stage->key, 0, sizeof(stage->key));
- populate_base_prog_key(devinfo, flags, robust_buffer_access, &key->base);
+ populate_base_prog_key(stage, device);
}
-struct anv_pipeline_stage {
- gl_shader_stage stage;
-
- const struct vk_shader_module *module;
- const char *entrypoint;
- const VkSpecializationInfo *spec_info;
-
- unsigned char shader_sha1[20];
-
- union brw_any_prog_key key;
+static void
+populate_bs_prog_key(struct anv_pipeline_stage *stage,
+ const struct anv_device *device,
+ uint32_t ray_flags)
+{
+ memset(&stage->key, 0, sizeof(stage->key));
- struct {
- gl_shader_stage stage;
- unsigned char sha1[20];
- } cache_key;
+ populate_base_prog_key(stage, device);
- nir_shader *nir;
+ stage->key.bs.pipeline_ray_flags = ray_flags;
+}
- struct anv_pipeline_binding surface_to_descriptor[256];
- struct anv_pipeline_binding sampler_to_descriptor[256];
- struct anv_pipeline_bind_map bind_map;
+static void
+anv_stage_write_shader_hash(struct anv_pipeline_stage *stage,
+ const struct anv_device *device)
+{
+ vk_pipeline_robustness_state_fill(&device->vk,
+ &stage->rstate,
+ stage->pipeline_pNext,
+ stage->info->pNext);
- union brw_any_prog_data prog_data;
+ vk_pipeline_hash_shader_stage(stage->info, &stage->rstate, stage->shader_sha1);
- uint32_t num_stats;
- struct brw_compile_stats stats[3];
- char *disasm[3];
+ stage->robust_flags = anv_get_robust_flags(&stage->rstate);
- VkPipelineCreationFeedbackEXT feedback;
+ /* Use lowest dword of source shader sha1 for shader hash. */
+ stage->source_hash = ((uint32_t*)stage->shader_sha1)[0];
+}
- const unsigned *code;
+static bool
+anv_graphics_pipeline_stage_fragment_dynamic(const struct anv_pipeline_stage *stage)
+{
+ if (stage->stage != MESA_SHADER_FRAGMENT)
+ return false;
- struct anv_shader_bin *bin;
-};
+ return stage->key.wm.persample_interp == BRW_SOMETIMES ||
+ stage->key.wm.multisample_fbo == BRW_SOMETIMES ||
+ stage->key.wm.alpha_to_coverage == BRW_SOMETIMES;
+}
static void
-anv_pipeline_hash_shader(const struct vk_shader_module *module,
- const char *entrypoint,
- gl_shader_stage stage,
- const VkSpecializationInfo *spec_info,
- unsigned char *sha1_out)
+anv_pipeline_hash_common(struct mesa_sha1 *ctx,
+ const struct anv_pipeline *pipeline)
{
- struct mesa_sha1 ctx;
- _mesa_sha1_init(&ctx);
+ struct anv_device *device = pipeline->device;
- _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
- _mesa_sha1_update(&ctx, entrypoint, strlen(entrypoint));
- _mesa_sha1_update(&ctx, &stage, sizeof(stage));
- if (spec_info) {
- _mesa_sha1_update(&ctx, spec_info->pMapEntries,
- spec_info->mapEntryCount *
- sizeof(*spec_info->pMapEntries));
- _mesa_sha1_update(&ctx, spec_info->pData,
- spec_info->dataSize);
- }
+ _mesa_sha1_update(ctx, pipeline->layout.sha1, sizeof(pipeline->layout.sha1));
- _mesa_sha1_final(&ctx, sha1_out);
+ const bool indirect_descriptors = device->physical->indirect_descriptors;
+ _mesa_sha1_update(ctx, &indirect_descriptors, sizeof(indirect_descriptors));
+
+ const bool rba = device->robust_buffer_access;
+ _mesa_sha1_update(ctx, &rba, sizeof(rba));
+
+ const int spilling_rate = device->physical->compiler->spilling_rate;
+ _mesa_sha1_update(ctx, &spilling_rate, sizeof(spilling_rate));
}
static void
-anv_pipeline_hash_graphics(struct anv_graphics_pipeline *pipeline,
- struct anv_pipeline_layout *layout,
+anv_pipeline_hash_graphics(struct anv_graphics_base_pipeline *pipeline,
struct anv_pipeline_stage *stages,
+ uint32_t view_mask,
unsigned char *sha1_out)
{
+ const struct anv_device *device = pipeline->base.device;
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- _mesa_sha1_update(&ctx, &pipeline->subpass->view_mask,
- sizeof(pipeline->subpass->view_mask));
+ anv_pipeline_hash_common(&ctx, &pipeline->base);
- if (layout)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+ _mesa_sha1_update(&ctx, &view_mask, sizeof(view_mask));
- const bool rba = pipeline->base.device->robust_buffer_access;
- _mesa_sha1_update(&ctx, &rba, sizeof(rba));
-
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (stages[s].entrypoint) {
+ for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (pipeline->base.active_stages & BITFIELD_BIT(s)) {
_mesa_sha1_update(&ctx, stages[s].shader_sha1,
sizeof(stages[s].shader_sha1));
_mesa_sha1_update(&ctx, &stages[s].key, brw_prog_key_size(s));
}
}
+ if (stages[MESA_SHADER_MESH].info || stages[MESA_SHADER_TASK].info) {
+ const uint8_t afs = device->physical->instance->assume_full_subgroups;
+ _mesa_sha1_update(&ctx, &afs, sizeof(afs));
+ }
+
_mesa_sha1_final(&ctx, sha1_out);
}
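
The overall shape of the cache-key hashing above (pipeline-wide inputs first, then each active stage's source hash and compile key) can be sketched standalone; FNV-1a stands in for Mesa's _mesa_sha1_* helpers and the struct below is purely illustrative.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative sketch, not part of this patch. */
static uint64_t
fnv1a(uint64_t h, const void *data, size_t len)
{
   const uint8_t *p = data;
   for (size_t i = 0; i < len; i++)
      h = (h ^ p[i]) * 0x100000001b3ull;
   return h;
}

struct fake_stage {
   uint8_t shader_sha1[20];   /* per-stage source hash */
   uint8_t key[32];           /* per-stage compile key */
};

int main(void)
{
   uint64_t h = 0xcbf29ce484222325ull;

   /* Pipeline-wide inputs first: layout hash, robustness, etc. */
   const uint8_t layout_sha1[20] = {0};
   const int robust = 1;
   h = fnv1a(h, layout_sha1, sizeof(layout_sha1));
   h = fnv1a(h, &robust, sizeof(robust));

   /* Then each active stage: its source hash and its compile key. */
   struct fake_stage stages[2];
   memset(stages, 0, sizeof(stages));
   for (int s = 0; s < 2; s++) {
      h = fnv1a(h, stages[s].shader_sha1, sizeof(stages[s].shader_sha1));
      h = fnv1a(h, stages[s].key, sizeof(stages[s].key));
   }

   printf("cache key: %016llx\n", (unsigned long long)h);
   return 0;
}
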
static void
anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline,
- struct anv_pipeline_layout *layout,
struct anv_pipeline_stage *stage,
unsigned char *sha1_out)
{
+ const struct anv_device *device = pipeline->base.device;
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- if (layout)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+ anv_pipeline_hash_common(&ctx, &pipeline->base);
- const bool rba = pipeline->base.device->robust_buffer_access;
- _mesa_sha1_update(&ctx, &rba, sizeof(rba));
+ const uint8_t afs = device->physical->instance->assume_full_subgroups;
+ _mesa_sha1_update(&ctx, &afs, sizeof(afs));
_mesa_sha1_update(&ctx, stage->shader_sha1,
sizeof(stage->shader_sha1));
@@ -705,18 +814,13 @@ anv_pipeline_hash_compute(struct anv_compute_pipeline *pipeline,
static void
anv_pipeline_hash_ray_tracing_shader(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_layout *layout,
struct anv_pipeline_stage *stage,
unsigned char *sha1_out)
{
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- if (layout != NULL)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
-
- const bool rba = pipeline->base.device->robust_buffer_access;
- _mesa_sha1_update(&ctx, &rba, sizeof(rba));
+ anv_pipeline_hash_common(&ctx, &pipeline->base);
_mesa_sha1_update(&ctx, stage->shader_sha1, sizeof(stage->shader_sha1));
_mesa_sha1_update(&ctx, &stage->key, sizeof(stage->key.bs));
@@ -726,7 +830,6 @@ anv_pipeline_hash_ray_tracing_shader(struct anv_ray_tracing_pipeline *pipeline,
static void
anv_pipeline_hash_ray_tracing_combined_shader(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_layout *layout,
struct anv_pipeline_stage *intersection,
struct anv_pipeline_stage *any_hit,
unsigned char *sha1_out)
@@ -734,8 +837,8 @@ anv_pipeline_hash_ray_tracing_combined_shader(struct anv_ray_tracing_pipeline *p
struct mesa_sha1 ctx;
_mesa_sha1_init(&ctx);
- if (layout != NULL)
- _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+ _mesa_sha1_update(&ctx, pipeline->base.layout.sha1,
+ sizeof(pipeline->base.layout.sha1));
const bool rba = pipeline->base.device->robust_buffer_access;
_mesa_sha1_update(&ctx, &rba, sizeof(rba));
@@ -750,14 +853,14 @@ anv_pipeline_hash_ray_tracing_combined_shader(struct anv_ray_tracing_pipeline *p
static nir_shader *
anv_pipeline_stage_get_nir(struct anv_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
void *mem_ctx,
struct anv_pipeline_stage *stage)
{
const struct brw_compiler *compiler =
pipeline->device->physical->compiler;
const nir_shader_compiler_options *nir_options =
- compiler->glsl_compiler_options[stage->stage].NirOptions;
+ compiler->nir_options[stage->stage];
nir_shader *nir;
nir = anv_device_search_for_nir(pipeline->device, cache,
@@ -769,12 +872,8 @@ anv_pipeline_stage_get_nir(struct anv_pipeline *pipeline,
return nir;
}
- nir = anv_shader_compile_to_nir(pipeline->device,
- mem_ctx,
- stage->module,
- stage->entrypoint,
- stage->stage,
- stage->spec_info);
+ nir = anv_shader_stage_to_nir(pipeline->device, stage->info,
+ stage->key.base.robust_flags, mem_ctx);
if (nir) {
anv_device_upload_nir(pipeline->device, cache, nir, stage->shader_sha1);
return nir;
@@ -783,6 +882,29 @@ anv_pipeline_stage_get_nir(struct anv_pipeline *pipeline,
return NULL;
}
+static const struct vk_ycbcr_conversion_state *
+lookup_ycbcr_conversion(const void *_sets_layout, uint32_t set,
+ uint32_t binding, uint32_t array_index)
+{
+ const struct anv_pipeline_sets_layout *sets_layout = _sets_layout;
+
+ assert(set < MAX_SETS);
+ assert(binding < sets_layout->set[set].layout->binding_count);
+ const struct anv_descriptor_set_binding_layout *bind_layout =
+ &sets_layout->set[set].layout->binding[binding];
+
+ if (bind_layout->immutable_samplers == NULL)
+ return NULL;
+
+ array_index = MIN2(array_index, bind_layout->array_size - 1);
+
+ const struct anv_sampler *sampler =
+ bind_layout->immutable_samplers[array_index];
+
+ return sampler && sampler->vk.ycbcr_conversion ?
+ &sampler->vk.ycbcr_conversion->state : NULL;
+}
+
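
The lookup_ycbcr_conversion() callback above follows a common C pattern: the lowering pass receives an opaque void * context and the callback casts it back to the concrete layout type. A minimal standalone sketch of that pattern (all names below are placeholders, not driver code):

#include <stdio.h>

/* Illustrative sketch, not part of this patch. */
struct layout { int binding_count; };

typedef const char *(*lookup_cb)(const void *ctx, int binding);

static const char *
lookup(const void *ctx, int binding)
{
   const struct layout *l = ctx;   /* recover the typed context */
   return binding < l->binding_count ? "conversion" : NULL;
}

/* Stands in for a lowering pass that only knows about the callback. */
static void
run_pass(lookup_cb cb, const void *ctx)
{
   const char *r = cb(ctx, 0);
   printf("%s\n", r ? r : "none");
}

int main(void)
{
   struct layout l = { .binding_count = 4 };
   run_pass(lookup, &l);
   return 0;
}
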
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
@@ -795,11 +917,91 @@ shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
*align = comp_size * (length == 3 ? 4 : length);
}
+static enum anv_dynamic_push_bits
+anv_nir_compute_dynamic_push_bits(nir_shader *shader)
+{
+ enum anv_dynamic_push_bits ret = 0;
+
+ nir_foreach_function_impl(impl, shader) {
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+ if (intrin->intrinsic != nir_intrinsic_load_push_constant)
+ continue;
+
+ switch (nir_intrinsic_base(intrin)) {
+ case offsetof(struct anv_push_constants, gfx.tcs_input_vertices):
+ ret |= ANV_DYNAMIC_PUSH_INPUT_VERTICES;
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ return ret;
+}
+
+static void
+anv_fixup_subgroup_size(struct anv_device *device, struct shader_info *info)
+{
+ switch (info->stage) {
+ case MESA_SHADER_COMPUTE:
+ case MESA_SHADER_TASK:
+ case MESA_SHADER_MESH:
+ break;
+ default:
+ return;
+ }
+
+ unsigned local_size = info->workgroup_size[0] *
+ info->workgroup_size[1] *
+ info->workgroup_size[2];
+
+   /* Games don't always request full subgroups when they should,
+    * which can cause bugs, as they may expect a bigger subgroup size
+    * than the one we choose for execution.
+ */
+ if (device->physical->instance->assume_full_subgroups &&
+ info->uses_wide_subgroup_intrinsics &&
+ info->subgroup_size == SUBGROUP_SIZE_API_CONSTANT &&
+ local_size &&
+ local_size % BRW_SUBGROUP_SIZE == 0)
+ info->subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
+
+ /* If the client requests that we dispatch full subgroups but doesn't
+ * allow us to pick a subgroup size, we have to smash it to the API
+ * value of 32. Performance will likely be terrible in this case but
+ * there's nothing we can do about that. The client should have chosen
+ * a size.
+ */
+ if (info->subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS)
+ info->subgroup_size =
+ device->physical->instance->assume_full_subgroups != 0 ?
+ device->physical->instance->assume_full_subgroups : BRW_SUBGROUP_SIZE;
+
+ /* Cooperative matrix extension requires that all invocations in a subgroup
+ * be active. As a result, when the application does not request a specific
+ * subgroup size, we must use SIMD32.
+ */
+ if (info->stage == MESA_SHADER_COMPUTE && info->cs.has_cooperative_matrix &&
+ info->subgroup_size < SUBGROUP_SIZE_REQUIRE_8) {
+ info->subgroup_size = BRW_SUBGROUP_SIZE;
+ }
+}
+
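
A reduced, standalone sketch of the resolution order in anv_fixup_subgroup_size() above; the constants and the boolean assume_full_subgroups parameter simplify the driver's SUBGROUP_SIZE_* and instance-option handling and are not the real values.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch, not part of this patch. */
#define API_CONSTANT      0   /* app relies on the API-visible constant */
#define FULL_SUBGROUPS    1   /* app asked for full subgroups, no size given */
#define HW_SUBGROUP_SIZE 32

static unsigned
resolve_subgroup_size(bool assume_full_subgroups, bool uses_wide_intrinsics,
                      unsigned local_size, unsigned requested)
{
   /* Workaround: promote API_CONSTANT to FULL_SUBGROUPS when the instance
    * option is set and the workgroup size divides evenly. */
   if (assume_full_subgroups && uses_wide_intrinsics &&
       requested == API_CONSTANT && local_size &&
       local_size % HW_SUBGROUP_SIZE == 0)
      requested = FULL_SUBGROUPS;

   /* Full subgroups without an explicit size collapse to the API value. */
   if (requested == FULL_SUBGROUPS)
      return HW_SUBGROUP_SIZE;

   return requested;
}

int main(void)
{
   printf("%u\n", resolve_subgroup_size(true, true, 64, API_CONSTANT));
   return 0;
}
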
static void
anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
void *mem_ctx,
struct anv_pipeline_stage *stage,
- struct anv_pipeline_layout *layout)
+ struct anv_pipeline_sets_layout *layout,
+ uint32_t view_mask,
+ bool use_primitive_replication)
{
const struct anv_physical_device *pdevice = pipeline->device->physical;
const struct brw_compiler *compiler = pdevice->compiler;
@@ -808,80 +1010,153 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
nir_shader *nir = stage->nir;
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
- /* Check if sample shading is enabled in the shader and toggle
- * it on for the pipeline independent if sampleShadingEnable is set.
- */
- nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
- if (nir->info.fs.uses_sample_shading)
- anv_pipeline_to_graphics(pipeline)->sample_shading_enable = true;
+ NIR_PASS(_, nir, nir_lower_wpos_center);
+ NIR_PASS(_, nir, nir_lower_input_attachments,
+ &(nir_input_attachment_options) {
+ .use_fragcoord_sysval = true,
+ .use_layer_id_sysval = true,
+ });
+ }
+
+ if (nir->info.stage == MESA_SHADER_MESH ||
+ nir->info.stage == MESA_SHADER_TASK) {
+ nir_lower_compute_system_values_options options = {
+ .lower_cs_local_id_to_index = true,
+ .lower_workgroup_id_to_index = true,
+ /* nir_lower_idiv generates expensive code */
+ .shortcut_1d_workgroup_id = compiler->devinfo->verx10 >= 125,
+ };
- NIR_PASS_V(nir, nir_lower_wpos_center,
- anv_pipeline_to_graphics(pipeline)->sample_shading_enable);
- NIR_PASS_V(nir, nir_lower_input_attachments,
- &(nir_input_attachment_options) {
- .use_fragcoord_sysval = true,
- .use_layer_id_sysval = true,
- });
+ NIR_PASS(_, nir, nir_lower_compute_system_values, &options);
}
- NIR_PASS_V(nir, anv_nir_lower_ycbcr_textures, layout);
+ NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion, layout);
- if (pipeline->type == ANV_PIPELINE_GRAPHICS) {
- NIR_PASS_V(nir, anv_nir_lower_multiview,
- anv_pipeline_to_graphics(pipeline));
+ if (pipeline->type == ANV_PIPELINE_GRAPHICS ||
+ pipeline->type == ANV_PIPELINE_GRAPHICS_LIB) {
+ NIR_PASS(_, nir, anv_nir_lower_multiview, view_mask,
+ use_primitive_replication);
}
+ if (nir->info.stage == MESA_SHADER_COMPUTE && nir->info.cs.has_cooperative_matrix) {
+ anv_fixup_subgroup_size(pipeline->device, &nir->info);
+ NIR_PASS(_, nir, brw_nir_lower_cmat, nir->info.subgroup_size);
+ NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, 16);
+ }
+
+ /* The patch control points are delivered through a push constant when
+ * dynamic.
+ */
+ if (nir->info.stage == MESA_SHADER_TESS_CTRL &&
+ stage->key.tcs.input_vertices == 0)
+ NIR_PASS(_, nir, anv_nir_lower_load_patch_vertices_in);
+
nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
- NIR_PASS_V(nir, brw_nir_lower_storage_image, compiler->devinfo);
+ NIR_PASS(_, nir, brw_nir_lower_storage_image,
+ &(struct brw_nir_lower_storage_image_opts) {
+               /* Anv only supports Gfx9+, which has better-defined typed-read
+                * behavior, so we only have to care about lowering loads.
+ */
+ .devinfo = compiler->devinfo,
+ .lower_loads = true,
+ });
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_global,
- nir_address_format_64bit_global);
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_push_const,
- nir_address_format_32bit_offset);
+ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
+ nir_address_format_64bit_global);
+ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
+ nir_address_format_32bit_offset);
+
+ NIR_PASS(_, nir, brw_nir_lower_ray_queries, &pdevice->info);
+
+ stage->push_desc_info.used_descriptors =
+ anv_nir_compute_used_push_descriptors(nir, layout);
+
+ struct anv_pipeline_push_map push_map = {};
/* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
- anv_nir_apply_pipeline_layout(pdevice,
- pipeline->device->robust_buffer_access,
- layout, nir, &stage->bind_map);
+ NIR_PASS_V(nir, anv_nir_apply_pipeline_layout,
+ pdevice, stage->key.base.robust_flags,
+ layout->independent_sets,
+ layout, &stage->bind_map, &push_map, mem_ctx);
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
- anv_nir_ubo_addr_format(pdevice,
- pipeline->device->robust_buffer_access));
- NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
- anv_nir_ssbo_addr_format(pdevice,
- pipeline->device->robust_buffer_access));
+ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
+ anv_nir_ubo_addr_format(pdevice, stage->key.base.robust_flags));
+ NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
+ anv_nir_ssbo_addr_format(pdevice, stage->key.base.robust_flags));
/* First run copy-prop to get rid of all of the vec() that address
* calculations often create and then constant-fold so that, when we
* get to anv_nir_lower_ubo_loads, we can detect constant offsets.
*/
- NIR_PASS_V(nir, nir_copy_prop);
- NIR_PASS_V(nir, nir_opt_constant_folding);
+ bool progress;
+ do {
+ progress = false;
+ NIR_PASS(progress, nir, nir_opt_algebraic);
+ NIR_PASS(progress, nir, nir_copy_prop);
+ NIR_PASS(progress, nir, nir_opt_constant_folding);
+ NIR_PASS(progress, nir, nir_opt_dce);
+ } while (progress);
+
+ /* Required for nir_divergence_analysis() which is needed for
+ * anv_nir_lower_ubo_loads.
+ */
+ NIR_PASS(_, nir, nir_convert_to_lcssa, true, true);
+ nir_divergence_analysis(nir);
+
+ NIR_PASS(_, nir, anv_nir_lower_ubo_loads);
- NIR_PASS_V(nir, anv_nir_lower_ubo_loads);
+ NIR_PASS(_, nir, nir_opt_remove_phis);
- /* We don't support non-uniform UBOs and non-uniform SSBO access is
- * handled naturally by falling back to A64 messages.
+ enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
+ nir_lower_non_uniform_texture_access |
+ nir_lower_non_uniform_image_access |
+ nir_lower_non_uniform_get_ssbo_size;
+
+ /* In practice, most shaders do not have non-uniform-qualified
+ * accesses (see
+ * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17558#note_1475069)
+    * so we first run a cheaper check that will most likely find nothing.
*/
- NIR_PASS_V(nir, nir_lower_non_uniform_access,
- &(nir_lower_non_uniform_access_options) {
- .types = nir_lower_non_uniform_texture_access |
- nir_lower_non_uniform_image_access,
+ if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
+ NIR_PASS(_, nir, nir_opt_non_uniform_access);
+
+ /* We don't support non-uniform UBOs and non-uniform SSBO access is
+ * handled naturally by falling back to A64 messages.
+ */
+ NIR_PASS(_, nir, nir_lower_non_uniform_access,
+ &(nir_lower_non_uniform_access_options) {
+ .types = lower_non_uniform_access_types,
.callback = NULL,
- });
+ });
+
+ NIR_PASS(_, nir, intel_nir_lower_non_uniform_resource_intel);
+ NIR_PASS(_, nir, intel_nir_cleanup_resource_intel);
+ NIR_PASS(_, nir, nir_opt_dce);
+ }
+
+ NIR_PASS_V(nir, anv_nir_update_resource_intel_block);
- anv_nir_compute_push_layout(pdevice, pipeline->device->robust_buffer_access,
- nir, prog_data, &stage->bind_map, mem_ctx);
+ stage->dynamic_push_values = anv_nir_compute_dynamic_push_bits(nir);
+
+ NIR_PASS_V(nir, anv_nir_compute_push_layout,
+ pdevice, stage->key.base.robust_flags,
+ anv_graphics_pipeline_stage_fragment_dynamic(stage),
+ prog_data, &stage->bind_map, &push_map,
+ pipeline->layout.type, mem_ctx);
+
+ NIR_PASS_V(nir, anv_nir_lower_resource_intel, pdevice,
+ pipeline->layout.type);
if (gl_shader_stage_uses_workgroup(nir->info.stage)) {
if (!nir->info.shared_memory_explicit_layout) {
- NIR_PASS_V(nir, nir_lower_vars_to_explicit_types,
- nir_var_mem_shared, shared_type_info);
+ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
+ nir_var_mem_shared, shared_type_info);
}
- NIR_PASS_V(nir, nir_lower_explicit_io,
- nir_var_mem_shared, nir_address_format_32bit_offset);
+ NIR_PASS(_, nir, nir_lower_explicit_io,
+ nir_var_mem_shared, nir_address_format_32bit_offset);
if (nir->info.zero_initialize_shared_memory &&
nir->info.shared_size > 0) {
@@ -894,11 +1169,22 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
assert(shared_size <=
intel_calculate_slm_size(compiler->devinfo->ver, nir->info.shared_size));
- NIR_PASS_V(nir, nir_zero_initialize_shared_memory,
- shared_size, chunk_size);
+ NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
+ shared_size, chunk_size);
}
}
+ if (gl_shader_stage_is_compute(nir->info.stage) ||
+ gl_shader_stage_is_mesh(nir->info.stage)) {
+ NIR_PASS(_, nir, brw_nir_lower_cs_intrinsics, compiler->devinfo,
+ &stage->prog_data.cs);
+ }
+
+ stage->push_desc_info.used_set_buffer =
+ anv_nir_loads_push_desc_buffer(nir, layout, &stage->bind_map);
+ stage->push_desc_info.fully_promoted_ubo_descriptors =
+ anv_nir_push_desc_ubo_fully_promoted(nir, layout, &stage->bind_map);
+
stage->nir = nir;
}
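
The copy-prop/constant-fold loop above runs a set of passes until none of them reports progress; the pattern itself, with placeholder passes instead of real NIR ones, looks like this (a sketch, not driver code):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch, not part of this patch. */
struct ir { int foldable; };

/* Each pass returns true if it changed the IR (placeholder behaviour). */
static bool
pass_fold(struct ir *ir)
{
   if (ir->foldable > 0) {
      ir->foldable--;
      return true;
   }
   return false;
}

static bool
pass_dce(struct ir *ir)
{
   (void)ir;
   return false;
}

int main(void)
{
   struct ir ir = { .foldable = 3 };
   bool progress;
   do {
      progress = false;
      progress |= pass_fold(&ir);
      progress |= pass_dce(&ir);
   } while (progress);
   printf("done, foldable left: %d\n", ir.foldable);
   return 0;
}
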
@@ -914,14 +1200,19 @@ anv_pipeline_link_vs(const struct brw_compiler *compiler,
static void
anv_pipeline_compile_vs(const struct brw_compiler *compiler,
void *mem_ctx,
- struct anv_graphics_pipeline *pipeline,
- struct anv_pipeline_stage *vs_stage)
+ struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_stage *vs_stage,
+ uint32_t view_mask)
{
/* When using Primitive Replication for multiview, each view gets its own
* position slot.
*/
- uint32_t pos_slots = pipeline->use_primitive_replication ?
- anv_subpass_view_count(pipeline->subpass) : 1;
+ uint32_t pos_slots =
+ (vs_stage->nir->info.per_view_outputs & VARYING_BIT_POS) ?
+ MAX2(1, util_bitcount(view_mask)) : 1;
+
+ /* Only position is allowed to be per-view */
+ assert(!(vs_stage->nir->info.per_view_outputs & ~VARYING_BIT_POS));
brw_compute_vue_map(compiler->devinfo,
&vs_stage->prog_data.vs.base.vue_map,
@@ -932,14 +1223,18 @@ anv_pipeline_compile_vs(const struct brw_compiler *compiler,
vs_stage->num_stats = 1;
struct brw_compile_vs_params params = {
- .nir = vs_stage->nir,
+ .base = {
+ .nir = vs_stage->nir,
+ .stats = vs_stage->stats,
+ .log_data = pipeline->base.device,
+ .mem_ctx = mem_ctx,
+ .source_hash = vs_stage->source_hash,
+ },
.key = &vs_stage->key.vs,
.prog_data = &vs_stage->prog_data.vs,
- .stats = vs_stage->stats,
- .log_data = pipeline->base.device,
};
- vs_stage->code = brw_compile_vs(compiler, mem_ctx, &params);
+ vs_stage->code = brw_compile_vs(compiler, &params);
}
static void
@@ -973,10 +1268,10 @@ merge_tess_info(struct shader_info *tes_info,
tcs_info->tess.spacing == tes_info->tess.spacing);
tes_info->tess.spacing |= tcs_info->tess.spacing;
- assert(tcs_info->tess.primitive_mode == 0 ||
- tes_info->tess.primitive_mode == 0 ||
- tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode);
- tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
+ assert(tcs_info->tess._primitive_mode == 0 ||
+ tes_info->tess._primitive_mode == 0 ||
+ tcs_info->tess._primitive_mode == tes_info->tess._primitive_mode);
+ tes_info->tess._primitive_mode |= tcs_info->tess._primitive_mode;
tes_info->tess.ccw |= tcs_info->tess.ccw;
tes_info->tess.point_mode |= tcs_info->tess.point_mode;
}
@@ -1001,12 +1296,8 @@ anv_pipeline_link_tcs(const struct brw_compiler *compiler,
* this comes from the SPIR-V, which is part of the hash used for the
* pipeline cache. So it should be safe.
*/
- tcs_stage->key.tcs.tes_primitive_mode =
- tes_stage->nir->info.tess.primitive_mode;
- tcs_stage->key.tcs.quads_workaround =
- compiler->devinfo->ver < 9 &&
- tes_stage->nir->info.tess.primitive_mode == 7 /* GL_QUADS */ &&
- tes_stage->nir->info.tess.spacing == TESS_SPACING_EQUAL;
+ tcs_stage->key.tcs._tes_primitive_mode =
+ tes_stage->nir->info.tess._primitive_mode;
}
static void
@@ -1022,11 +1313,20 @@ anv_pipeline_compile_tcs(const struct brw_compiler *compiler,
tcs_stage->nir->info.patch_outputs_written;
tcs_stage->num_stats = 1;
- tcs_stage->code = brw_compile_tcs(compiler, device, mem_ctx,
- &tcs_stage->key.tcs,
- &tcs_stage->prog_data.tcs,
- tcs_stage->nir, -1,
- tcs_stage->stats, NULL);
+
+ struct brw_compile_tcs_params params = {
+ .base = {
+ .nir = tcs_stage->nir,
+ .stats = tcs_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = tcs_stage->source_hash,
+ },
+ .key = &tcs_stage->key.tcs,
+ .prog_data = &tcs_stage->prog_data.tcs,
+ };
+
+ tcs_stage->code = brw_compile_tcs(compiler, &params);
}
static void
@@ -1051,12 +1351,21 @@ anv_pipeline_compile_tes(const struct brw_compiler *compiler,
tcs_stage->nir->info.patch_outputs_written;
tes_stage->num_stats = 1;
- tes_stage->code = brw_compile_tes(compiler, device, mem_ctx,
- &tes_stage->key.tes,
- &tcs_stage->prog_data.tcs.base.vue_map,
- &tes_stage->prog_data.tes,
- tes_stage->nir, -1,
- tes_stage->stats, NULL);
+
+ struct brw_compile_tes_params params = {
+ .base = {
+ .nir = tes_stage->nir,
+ .stats = tes_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = tes_stage->source_hash,
+ },
+ .key = &tes_stage->key.tes,
+ .prog_data = &tes_stage->prog_data.tes,
+ .input_vue_map = &tcs_stage->prog_data.tcs.base.vue_map,
+ };
+
+ tes_stage->code = brw_compile_tes(compiler, &params);
}
static void
@@ -1081,17 +1390,120 @@ anv_pipeline_compile_gs(const struct brw_compiler *compiler,
gs_stage->nir->info.separate_shader, 1);
gs_stage->num_stats = 1;
- gs_stage->code = brw_compile_gs(compiler, device, mem_ctx,
- &gs_stage->key.gs,
- &gs_stage->prog_data.gs,
- gs_stage->nir, -1,
- gs_stage->stats, NULL);
+
+ struct brw_compile_gs_params params = {
+ .base = {
+ .nir = gs_stage->nir,
+ .stats = gs_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = gs_stage->source_hash,
+ },
+ .key = &gs_stage->key.gs,
+ .prog_data = &gs_stage->prog_data.gs,
+ };
+
+ gs_stage->code = brw_compile_gs(compiler, &params);
+}
+
+static void
+anv_pipeline_link_task(const struct brw_compiler *compiler,
+ struct anv_pipeline_stage *task_stage,
+ struct anv_pipeline_stage *next_stage)
+{
+ assert(next_stage);
+ assert(next_stage->stage == MESA_SHADER_MESH);
+ brw_nir_link_shaders(compiler, task_stage->nir, next_stage->nir);
+}
+
+static void
+anv_pipeline_compile_task(const struct brw_compiler *compiler,
+ void *mem_ctx,
+ struct anv_device *device,
+ struct anv_pipeline_stage *task_stage)
+{
+ task_stage->num_stats = 1;
+
+ struct brw_compile_task_params params = {
+ .base = {
+ .nir = task_stage->nir,
+ .stats = task_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = task_stage->source_hash,
+ },
+ .key = &task_stage->key.task,
+ .prog_data = &task_stage->prog_data.task,
+ };
+
+ task_stage->code = brw_compile_task(compiler, &params);
+}
+
+static void
+anv_pipeline_link_mesh(const struct brw_compiler *compiler,
+ struct anv_pipeline_stage *mesh_stage,
+ struct anv_pipeline_stage *next_stage)
+{
+ if (next_stage) {
+ brw_nir_link_shaders(compiler, mesh_stage->nir, next_stage->nir);
+ }
+}
+
+static void
+anv_pipeline_compile_mesh(const struct brw_compiler *compiler,
+ void *mem_ctx,
+ struct anv_device *device,
+ struct anv_pipeline_stage *mesh_stage,
+ struct anv_pipeline_stage *prev_stage)
+{
+ mesh_stage->num_stats = 1;
+
+ struct brw_compile_mesh_params params = {
+ .base = {
+ .nir = mesh_stage->nir,
+ .stats = mesh_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = mesh_stage->source_hash,
+ },
+ .key = &mesh_stage->key.mesh,
+ .prog_data = &mesh_stage->prog_data.mesh,
+ };
+
+ if (prev_stage) {
+ assert(prev_stage->stage == MESA_SHADER_TASK);
+ params.tue_map = &prev_stage->prog_data.task.map;
+ }
+
+ mesh_stage->code = brw_compile_mesh(compiler, &params);
}
static void
anv_pipeline_link_fs(const struct brw_compiler *compiler,
- struct anv_pipeline_stage *stage)
+ struct anv_pipeline_stage *stage,
+ const struct vk_render_pass_state *rp)
{
+   /* Initially the valid outputs value is set to consider all possible
+    * render targets valid (see populate_wm_prog_key()), before we look at
+    * the shader variables. Here we look at the shader's output variables
+    * and compute the correct number of render target outputs.
+ */
+ stage->key.wm.color_outputs_valid = 0;
+ nir_foreach_shader_out_variable_safe(var, stage->nir) {
+ if (var->data.location < FRAG_RESULT_DATA0)
+ continue;
+
+ const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
+ const unsigned array_len =
+ glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1;
+ assert(rt + array_len <= MAX_RTS);
+
+ stage->key.wm.color_outputs_valid |= BITFIELD_RANGE(rt, array_len);
+ }
+ stage->key.wm.color_outputs_valid &= rp_color_mask(rp);
+ stage->key.wm.nr_color_regions =
+ util_last_bit(stage->key.wm.color_outputs_valid);
+
unsigned num_rt_bindings;
struct anv_pipeline_binding rt_bindings[MAX_RTS];
if (stage->key.wm.nr_color_regions > 0) {
@@ -1101,12 +1513,15 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
rt_bindings[rt] = (struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
.index = rt,
+ .binding = UINT32_MAX,
};
} else {
/* Setup a null render target */
rt_bindings[rt] = (struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
.index = UINT32_MAX,
+ .binding = UINT32_MAX,
};
}
}
@@ -1125,53 +1540,6 @@ anv_pipeline_link_fs(const struct brw_compiler *compiler,
typed_memcpy(stage->bind_map.surface_to_descriptor,
rt_bindings, num_rt_bindings);
stage->bind_map.surface_count += num_rt_bindings;
-
- /* Now that we've set up the color attachments, we can go through and
- * eliminate any shader outputs that map to VK_ATTACHMENT_UNUSED in the
- * hopes that dead code can clean them up in this and any earlier shader
- * stages.
- */
- nir_function_impl *impl = nir_shader_get_entrypoint(stage->nir);
- bool deleted_output = false;
- nir_foreach_shader_out_variable_safe(var, stage->nir) {
- /* TODO: We don't delete depth/stencil writes. We probably could if the
- * subpass doesn't have a depth/stencil attachment.
- */
- if (var->data.location < FRAG_RESULT_DATA0)
- continue;
-
- const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
-
- /* If this is the RT at location 0 and we have alpha to coverage
- * enabled we still need that write because it will affect the coverage
- * mask even if it's never written to a color target.
- */
- if (rt == 0 && stage->key.wm.alpha_to_coverage)
- continue;
-
- const unsigned array_len =
- glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1;
- assert(rt + array_len <= MAX_RTS);
-
- if (rt >= MAX_RTS || !(stage->key.wm.color_outputs_valid &
- BITFIELD_RANGE(rt, array_len))) {
- deleted_output = true;
- var->data.mode = nir_var_function_temp;
- exec_node_remove(&var->node);
- exec_list_push_tail(&impl->locals, &var->node);
- }
- }
-
- if (deleted_output)
- nir_fixup_deref_modes(stage->nir);
-
- /* We stored the number of subpass color attachments in nr_color_regions
- * when calculating the key for caching. Now that we've computed the bind
- * map, we can reduce this to the actual max before we go into the back-end
- * compiler.
- */
- stage->key.wm.nr_color_regions =
- util_last_bit(stage->key.wm.color_outputs_valid);
}
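
The mask computation in anv_pipeline_link_fs() above reduces to a bit-range OR per declared output followed by a last-bit count; here is a standalone sketch with BITFIELD_RANGE/util_last_bit expanded into plain expressions (the output locations and MAX_RTS value below are made up):

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch, not part of this patch. */
#define MAX_RTS 8

struct fs_output { unsigned location; unsigned array_len; };

int main(void)
{
   /* Two declared outputs: RT0, and an array of 2 starting at RT2. */
   const struct fs_output outs[] = { { 0, 1 }, { 2, 2 } };

   uint32_t valid = 0;
   for (unsigned i = 0; i < sizeof(outs) / sizeof(outs[0]); i++) {
      if (outs[i].location + outs[i].array_len > MAX_RTS)
         continue;
      /* BITFIELD_RANGE(location, array_len) */
      valid |= ((1u << outs[i].array_len) - 1) << outs[i].location;
   }

   /* util_last_bit(valid) */
   unsigned nr_color_regions = valid ? 32 - (unsigned)__builtin_clz(valid) : 0;

   printf("color_outputs_valid=0x%x nr_color_regions=%u\n",
          valid, nr_color_regions);
   return 0;
}
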
static void
@@ -1179,45 +1547,61 @@ anv_pipeline_compile_fs(const struct brw_compiler *compiler,
void *mem_ctx,
struct anv_device *device,
struct anv_pipeline_stage *fs_stage,
- struct anv_pipeline_stage *prev_stage)
+ struct anv_pipeline_stage *prev_stage,
+ struct anv_graphics_base_pipeline *pipeline,
+ uint32_t view_mask,
+ bool use_primitive_replication)
{
- /* TODO: we could set this to 0 based on the information in nir_shader, but
- * we need this before we call spirv_to_nir.
+ /* When using Primitive Replication for multiview, each view gets its own
+ * position slot.
+ */
+ uint32_t pos_slots = use_primitive_replication ?
+ MAX2(1, util_bitcount(view_mask)) : 1;
+
+ /* If we have a previous stage we can use that to deduce valid slots.
+    * Otherwise, rely on the inputs of the fragment shader itself.
*/
- assert(prev_stage);
- fs_stage->key.wm.input_slots_valid =
- prev_stage->prog_data.vue.vue_map.slots_valid;
+ if (prev_stage) {
+ fs_stage->key.wm.input_slots_valid =
+ prev_stage->prog_data.vue.vue_map.slots_valid;
+ } else {
+ struct intel_vue_map prev_vue_map;
+ brw_compute_vue_map(compiler->devinfo,
+ &prev_vue_map,
+ fs_stage->nir->info.inputs_read,
+ fs_stage->nir->info.separate_shader,
+ pos_slots);
+
+ fs_stage->key.wm.input_slots_valid = prev_vue_map.slots_valid;
+ }
struct brw_compile_fs_params params = {
- .nir = fs_stage->nir,
+ .base = {
+ .nir = fs_stage->nir,
+ .stats = fs_stage->stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ .source_hash = fs_stage->source_hash,
+ },
.key = &fs_stage->key.wm,
.prog_data = &fs_stage->prog_data.wm,
.allow_spilling = true,
- .stats = fs_stage->stats,
- .log_data = device,
+ .max_polygons = UCHAR_MAX,
};
- fs_stage->code = brw_compile_fs(compiler, mem_ctx, &params);
+ if (prev_stage && prev_stage->stage == MESA_SHADER_MESH) {
+ params.mue_map = &prev_stage->prog_data.mesh.map;
+ /* TODO(mesh): Slots valid, do we even use/rely on it? */
+ }
+
+ fs_stage->code = brw_compile_fs(compiler, &params);
- fs_stage->num_stats = (uint32_t)fs_stage->prog_data.wm.dispatch_8 +
+ fs_stage->num_stats = (uint32_t)!!fs_stage->prog_data.wm.dispatch_multi +
+ (uint32_t)fs_stage->prog_data.wm.dispatch_8 +
(uint32_t)fs_stage->prog_data.wm.dispatch_16 +
(uint32_t)fs_stage->prog_data.wm.dispatch_32;
-
- if (fs_stage->key.wm.color_outputs_valid == 0 &&
- !fs_stage->prog_data.wm.has_side_effects &&
- !fs_stage->prog_data.wm.uses_omask &&
- !fs_stage->key.wm.alpha_to_coverage &&
- !fs_stage->prog_data.wm.uses_kill &&
- fs_stage->prog_data.wm.computed_depth_mode == BRW_PSCDEPTH_OFF &&
- !fs_stage->prog_data.wm.computed_stencil) {
- /* This fragment shader has no outputs and no side effects. Go ahead
- * and return the code pointer so we don't accidentally think the
- * compile failed but zero out prog_data which will set program_size to
- * zero and disable the stage.
- */
- memset(&fs_stage->prog_data, 0, sizeof(fs_stage->prog_data));
- }
+ assert(fs_stage->num_stats <= ARRAY_SIZE(fs_stage->stats));
}
static void
@@ -1229,14 +1613,14 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
char *nir = NULL;
if (stage->nir &&
(pipeline->flags &
- VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) {
+ VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) {
nir = nir_shader_as_str(stage->nir, pipeline->mem_ctx);
}
char *disasm = NULL;
if (stage->code &&
(pipeline->flags &
- VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) {
+ VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR)) {
char *stream_data = NULL;
size_t stream_size = 0;
FILE *stream = open_memstream(&stream_data, &stream_size);
@@ -1262,6 +1646,12 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
fprintf(stream, "Vulkan push constants and API params");
break;
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
+ fprintf(stream, "Descriptor buffer (desc buffer) for set %d (start=%dB)",
+ stage->bind_map.push_ranges[i].index,
+ stage->bind_map.push_ranges[i].start * 32);
+ break;
+
case ANV_DESCRIPTOR_SET_DESCRIPTORS:
fprintf(stream, "Descriptor buffer for set %d (start=%dB)",
stage->bind_map.push_ranges[i].index,
@@ -1271,11 +1661,6 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS:
unreachable("gl_NumWorkgroups is never pushed");
- case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
- fprintf(stream, "Inline shader constant data (start=%dB)",
- stage->bind_map.push_ranges[i].start * 32);
- break;
-
case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
unreachable("Color attachments can't be pushed");
@@ -1294,8 +1679,8 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
/* Creating this is far cheaper than it looks. It's perfectly fine to
* do it for every binary.
*/
- intel_disassemble(&pipeline->device->info,
- stage->code, code_offset, stream);
+ brw_disassemble_with_errors(&pipeline->device->physical->compiler->isa,
+ stage->code, code_offset, stream);
fclose(stream);
@@ -1319,8 +1704,7 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
static void
anv_pipeline_add_executables(struct anv_pipeline *pipeline,
- struct anv_pipeline_stage *stage,
- struct anv_shader_bin *bin)
+ struct anv_pipeline_stage *stage)
{
if (stage->stage == MESA_SHADER_FRAGMENT) {
/* We pull the prog data and stats out of the anv_shader_bin because
@@ -1328,10 +1712,11 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline,
* looked up the shader in a cache.
*/
const struct brw_wm_prog_data *wm_prog_data =
- (const struct brw_wm_prog_data *)bin->prog_data;
- struct brw_compile_stats *stats = bin->stats;
+ (const struct brw_wm_prog_data *)stage->bin->prog_data;
+ struct brw_compile_stats *stats = stage->bin->stats;
- if (wm_prog_data->dispatch_8) {
+ if (wm_prog_data->dispatch_8 ||
+ wm_prog_data->dispatch_multi) {
anv_pipeline_add_executable(pipeline, stage, stats++, 0);
}
@@ -1345,551 +1730,950 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline,
wm_prog_data->prog_offset_32);
}
} else {
- anv_pipeline_add_executable(pipeline, stage, bin->stats, 0);
+ anv_pipeline_add_executable(pipeline, stage, stage->bin->stats, 0);
}
}
static void
-anv_pipeline_init_from_cached_graphics(struct anv_graphics_pipeline *pipeline)
+anv_pipeline_account_shader(struct anv_pipeline *pipeline,
+ struct anv_shader_bin *shader)
{
- /* TODO: Cache this pipeline-wide information. */
+ pipeline->scratch_size = MAX2(pipeline->scratch_size,
+ shader->prog_data->total_scratch);
- /* Primitive replication depends on information from all the shaders.
- * Recover this bit from the fact that we have more than one position slot
- * in the vertex shader when using it.
- */
- assert(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT);
- int pos_slots = 0;
- const struct brw_vue_prog_data *vue_prog_data =
- (const void *) pipeline->shaders[MESA_SHADER_VERTEX]->prog_data;
- const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
- for (int i = 0; i < vue_map->num_slots; i++) {
- if (vue_map->slot_to_varying[i] == VARYING_SLOT_POS)
- pos_slots++;
+ pipeline->ray_queries = MAX2(pipeline->ray_queries,
+ shader->prog_data->ray_queries);
+
+ if (shader->push_desc_info.used_set_buffer) {
+ pipeline->use_push_descriptor_buffer |=
+ mesa_to_vk_shader_stage(shader->stage);
}
- pipeline->use_primitive_replication = pos_slots > 1;
+ if (shader->push_desc_info.used_descriptors &
+ ~shader->push_desc_info.fully_promoted_ubo_descriptors)
+ pipeline->use_push_descriptor |= mesa_to_vk_shader_stage(shader->stage);
}
-static VkResult
-anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
- const VkGraphicsPipelineCreateInfo *info)
+/* This function returns true if a shader should not be looked at because of
+ * fast linking. Instead we should use the shader binaries provided by
+ * libraries.
+ */
+static bool
+anv_graphics_pipeline_skip_shader_compile(struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_stage *stages,
+ bool link_optimize,
+ gl_shader_stage stage)
{
- VkPipelineCreationFeedbackEXT pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
- };
- int64_t pipeline_start = os_time_get_nano();
-
- const struct brw_compiler *compiler = pipeline->base.device->physical->compiler;
- struct anv_pipeline_stage stages[MESA_SHADER_STAGES] = {};
-
- pipeline->active_stages = 0;
+ /* Always skip non active stages */
+ if (!anv_pipeline_base_has_stage(pipeline, stage))
+ return true;
- /* Information on which states are considered dynamic. */
- const VkPipelineDynamicStateCreateInfo *dyn_info =
- info->pDynamicState;
- uint32_t dynamic_states = 0;
- if (dyn_info) {
- for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
- dynamic_states |=
- anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
- }
+ /* When link optimizing, consider all stages */
+ if (link_optimize)
+ return false;
- VkResult result;
- for (uint32_t i = 0; i < info->stageCount; i++) {
- const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i];
- gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
+ /* Otherwise check if the stage was specified through
+ * VkGraphicsPipelineCreateInfo
+ */
+ assert(stages[stage].info != NULL || stages[stage].imported.bin != NULL);
+ return stages[stage].info == NULL;
+}
- pipeline->active_stages |= sinfo->stage;
+static void
+anv_graphics_pipeline_init_keys(struct anv_graphics_base_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state,
+ struct anv_pipeline_stage *stages)
+{
+ for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
int64_t stage_start = os_time_get_nano();
- stages[stage].stage = stage;
- stages[stage].module = vk_shader_module_from_handle(sinfo->module);
- stages[stage].entrypoint = sinfo->pName;
- stages[stage].spec_info = sinfo->pSpecializationInfo;
- anv_pipeline_hash_shader(stages[stage].module,
- stages[stage].entrypoint,
- stage,
- stages[stage].spec_info,
- stages[stage].shader_sha1);
-
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- switch (stage) {
+ const struct anv_device *device = pipeline->base.device;
+ switch (stages[s].stage) {
case MESA_SHADER_VERTEX:
- populate_vs_prog_key(devinfo, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- &stages[stage].key.vs);
+ populate_vs_prog_key(&stages[s], device);
break;
case MESA_SHADER_TESS_CTRL:
- populate_tcs_prog_key(devinfo, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- info->pTessellationState->patchControlPoints,
- &stages[stage].key.tcs);
+ populate_tcs_prog_key(&stages[s],
+ device,
+ BITSET_TEST(state->dynamic,
+ MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS) ?
+ 0 : state->ts->patch_control_points);
break;
case MESA_SHADER_TESS_EVAL:
- populate_tes_prog_key(devinfo, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- &stages[stage].key.tes);
+ populate_tes_prog_key(&stages[s], device);
break;
case MESA_SHADER_GEOMETRY:
- populate_gs_prog_key(devinfo, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- &stages[stage].key.gs);
+ populate_gs_prog_key(&stages[s], device);
break;
case MESA_SHADER_FRAGMENT: {
+         /* Assume rasterization is enabled in any of the following cases:
+ *
+ * - We're a pipeline library without pre-rasterization information
+ *
+          *    - Rasterization is not disabled in the non-dynamic state
+ *
+ * - Rasterization disable is dynamic
+ */
const bool raster_enabled =
- !info->pRasterizationState->rasterizerDiscardEnable ||
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
- populate_wm_prog_key(pipeline, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- pipeline->subpass,
- raster_enabled ? info->pMultisampleState : NULL,
- vk_find_struct_const(info->pNext,
- PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR),
- &stages[stage].key.wm);
+ state->rs == NULL ||
+ !state->rs->rasterizer_discard_enable ||
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
+ enum brw_sometimes is_mesh = BRW_NEVER;
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ if (anv_pipeline_base_has_stage(pipeline, MESA_SHADER_VERTEX))
+ is_mesh = BRW_NEVER;
+ else if (anv_pipeline_base_has_stage(pipeline, MESA_SHADER_MESH))
+ is_mesh = BRW_ALWAYS;
+ else {
+ assert(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB);
+ is_mesh = BRW_SOMETIMES;
+ }
+ }
+ populate_wm_prog_key(&stages[s],
+ pipeline,
+ state->dynamic,
+ raster_enabled ? state->ms : NULL,
+ state->fsr, state->rp, is_mesh);
break;
}
+
+ case MESA_SHADER_TASK:
+ populate_task_prog_key(&stages[s], device);
+ break;
+
+ case MESA_SHADER_MESH: {
+ const bool compact_mue =
+ !(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB &&
+ !anv_pipeline_base_has_stage(pipeline, MESA_SHADER_FRAGMENT));
+ populate_mesh_prog_key(&stages[s], device, compact_mue);
+ break;
+ }
+
default:
unreachable("Invalid graphics shader stage");
}
- stages[stage].feedback.duration += os_time_get_nano() - stage_start;
- stages[stage].feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
+ stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ stages[s].feedback.flags |= VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
}
+}
+
+static void
+anv_graphics_lib_retain_shaders(struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_stage *stages,
+ bool will_compile)
+{
+ /* There isn't much point in retaining NIR shaders on final pipelines. */
+ assert(pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB);
+
+ struct anv_graphics_lib_pipeline *lib = (struct anv_graphics_lib_pipeline *) pipeline;
- if (pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
- pipeline->active_stages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
+ for (int s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
- assert(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT);
+ memcpy(lib->retained_shaders[s].shader_sha1, stages[s].shader_sha1,
+ sizeof(stages[s].shader_sha1));
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
+ lib->retained_shaders[s].subgroup_size_type = stages[s].subgroup_size_type;
- unsigned char sha1[20];
- anv_pipeline_hash_graphics(pipeline, layout, stages, sha1);
+ nir_shader *nir = stages[s].nir != NULL ? stages[s].nir : stages[s].imported.nir;
+ assert(nir != NULL);
+
+ if (!will_compile) {
+ lib->retained_shaders[s].nir = nir;
+ } else {
+ lib->retained_shaders[s].nir =
+ nir_shader_clone(pipeline->base.mem_ctx, nir);
+ }
+ }
+}
+
+static bool
+anv_graphics_pipeline_load_cached_shaders(struct anv_graphics_base_pipeline *pipeline,
+ struct vk_pipeline_cache *cache,
+ struct anv_pipeline_stage *stages,
+ bool link_optimize,
+ VkPipelineCreationFeedback *pipeline_feedback)
+{
+ struct anv_device *device = pipeline->base.device;
+ unsigned cache_hits = 0, found = 0, imported = 0;
for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
+ if (!anv_pipeline_base_has_stage(pipeline, s))
continue;
- stages[s].cache_key.stage = s;
- memcpy(stages[s].cache_key.sha1, sha1, sizeof(sha1));
- }
+ int64_t stage_start = os_time_get_nano();
- const bool skip_cache_lookup =
- (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
+ bool cache_hit;
+ stages[s].bin =
+ anv_device_search_for_kernel(device, cache, &stages[s].cache_key,
+ sizeof(stages[s].cache_key), &cache_hit);
+ if (stages[s].bin) {
+ found++;
+ pipeline->shaders[s] = stages[s].bin;
+ }
- if (!skip_cache_lookup) {
- unsigned found = 0;
- unsigned cache_hits = 0;
+ if (cache_hit) {
+ cache_hits++;
+ stages[s].feedback.flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
+ }
+ stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ }
+
+   /* When not link optimizing, look up the missing shaders in the imported
+ * libraries.
+ */
+ if (!link_optimize) {
for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
+ if (!anv_pipeline_base_has_stage(pipeline, s))
continue;
- int64_t stage_start = os_time_get_nano();
+ if (pipeline->shaders[s] != NULL)
+ continue;
- bool cache_hit;
- struct anv_shader_bin *bin =
- anv_device_search_for_kernel(pipeline->base.device, cache,
- &stages[s].cache_key,
- sizeof(stages[s].cache_key), &cache_hit);
- if (bin) {
- found++;
- pipeline->shaders[s] = bin;
- }
+ if (stages[s].imported.bin == NULL)
+ continue;
- if (cache_hit) {
- cache_hits++;
- stages[s].feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
- }
- stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ stages[s].bin = stages[s].imported.bin;
+ pipeline->shaders[s] = anv_shader_bin_ref(stages[s].imported.bin);
+ pipeline->source_hashes[s] = stages[s].source_hash;
+ imported++;
}
+ }
- if (found == __builtin_popcount(pipeline->active_stages)) {
- if (cache_hits == found) {
- pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
- }
- /* We found all our shaders in the cache. We're done. */
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
- continue;
+ if ((found + imported) == __builtin_popcount(pipeline->base.active_stages)) {
+ if (cache_hits == found && found != 0) {
+ pipeline_feedback->flags |=
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
+ }
+ /* We found all our shaders in the cache. We're done. */
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (pipeline->shaders[s] == NULL)
+ continue;
- anv_pipeline_add_executables(&pipeline->base, &stages[s],
- pipeline->shaders[s]);
- }
- anv_pipeline_init_from_cached_graphics(pipeline);
- goto done;
- } else if (found > 0) {
- /* We found some but not all of our shaders. This shouldn't happen
- * most of the time but it can if we have a partially populated
- * pipeline cache.
+ /* Only add the executables when we're not importing or doing link
+ * optimizations. The imported executables are added earlier. Link
+ * optimization can produce different binaries.
*/
- assert(found < __builtin_popcount(pipeline->active_stages));
-
- vk_debug_report(&pipeline->base.device->physical->instance->vk,
- VK_DEBUG_REPORT_WARNING_BIT_EXT |
- VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT,
- &cache->base, 0, 0, "anv",
- "Found a partial pipeline in the cache. This is "
- "most likely caused by an incomplete pipeline cache "
- "import or export");
-
- /* We're going to have to recompile anyway, so just throw away our
- * references to the shaders in the cache. We'll get them out of the
- * cache again as part of the compilation process.
- */
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- stages[s].feedback.flags = 0;
- if (pipeline->shaders[s]) {
- anv_shader_bin_unref(pipeline->base.device, pipeline->shaders[s]);
- pipeline->shaders[s] = NULL;
- }
+ if (stages[s].imported.bin == NULL || link_optimize)
+ anv_pipeline_add_executables(&pipeline->base, &stages[s]);
+ pipeline->source_hashes[s] = stages[s].source_hash;
+ }
+ return true;
+ } else if (found > 0) {
+ /* We found some but not all of our shaders. This shouldn't happen most
+ * of the time but it can if we have a partially populated pipeline
+ * cache.
+ */
+ assert(found < __builtin_popcount(pipeline->base.active_stages));
+
+ /* With GPL, this might well happen if the app does an optimized
+ * link.
+ */
+ if (!pipeline->base.device->vk.enabled_extensions.EXT_graphics_pipeline_library) {
+ vk_perf(VK_LOG_OBJS(cache ? &cache->base :
+ &pipeline->base.device->vk.base),
+ "Found a partial pipeline in the cache. This is "
+ "most likely caused by an incomplete pipeline cache "
+ "import or export");
+ }
+
+ /* We're going to have to recompile anyway, so just throw away our
+ * references to the shaders in the cache. We'll get them out of the
+ * cache again as part of the compilation process.
+ */
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ stages[s].feedback.flags = 0;
+ if (pipeline->shaders[s]) {
+ anv_shader_bin_unref(device, pipeline->shaders[s]);
+ pipeline->shaders[s] = NULL;
}
}
}
- if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT)
- return VK_PIPELINE_COMPILE_REQUIRED_EXT;
+ return false;
+}
- void *pipeline_ctx = ralloc_context(NULL);
+static const gl_shader_stage graphics_shader_order[] = {
+ MESA_SHADER_VERTEX,
+ MESA_SHADER_TESS_CTRL,
+ MESA_SHADER_TESS_EVAL,
+ MESA_SHADER_GEOMETRY,
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
+ MESA_SHADER_TASK,
+ MESA_SHADER_MESH,
+
+ MESA_SHADER_FRAGMENT,
+};
+
+/* This function loads NIR only for stages specified in
+ * VkGraphicsPipelineCreateInfo::pStages[]
+ */
+static VkResult
+anv_graphics_pipeline_load_nir(struct anv_graphics_base_pipeline *pipeline,
+ struct vk_pipeline_cache *cache,
+ struct anv_pipeline_stage *stages,
+ void *mem_ctx,
+ bool need_clone)
+{
+ for (unsigned s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
continue;
int64_t stage_start = os_time_get_nano();
assert(stages[s].stage == s);
- assert(pipeline->shaders[s] == NULL);
-
- stages[s].bind_map = (struct anv_pipeline_bind_map) {
- .surface_to_descriptor = stages[s].surface_to_descriptor,
- .sampler_to_descriptor = stages[s].sampler_to_descriptor
- };
- stages[s].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache,
- pipeline_ctx,
- &stages[s]);
- if (stages[s].nir == NULL) {
- result = vk_error(VK_ERROR_UNKNOWN);
- goto fail;
+      /* Only use the NIR created from the pStages[] element if we don't have
+ * an imported library for the same stage.
+ */
+ if (stages[s].imported.bin == NULL) {
+ stages[s].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache,
+ mem_ctx, &stages[s]);
+ if (stages[s].nir == NULL)
+ return vk_error(pipeline, VK_ERROR_UNKNOWN);
+ } else {
+ stages[s].nir = need_clone ?
+ nir_shader_clone(mem_ctx, stages[s].imported.nir) :
+ stages[s].imported.nir;
}
- /* This is rather ugly.
+ stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ }
+
+ return VK_SUCCESS;
+}
+
+static void
+anv_pipeline_nir_preprocess(struct anv_pipeline *pipeline,
+ struct anv_pipeline_stage *stage)
+{
+ struct anv_device *device = pipeline->device;
+ const struct brw_compiler *compiler = device->physical->compiler;
+
+ const struct nir_lower_sysvals_to_varyings_options sysvals_to_varyings = {
+ .point_coord = true,
+ };
+ NIR_PASS(_, stage->nir, nir_lower_sysvals_to_varyings, &sysvals_to_varyings);
+
+ const nir_opt_access_options opt_access_options = {
+ .is_vulkan = true,
+ };
+ NIR_PASS(_, stage->nir, nir_opt_access, &opt_access_options);
+
+ /* Vulkan uses the separate-shader linking model */
+ stage->nir->info.separate_shader = true;
+
+ struct brw_nir_compiler_opts opts = {
+ .softfp64 = device->fp64_nir,
+ /* Assume robustness with EXT_pipeline_robustness because this can be
+ * turned on/off per pipeline and we have no visibility on this here.
+ */
+ .robust_image_access = device->vk.enabled_features.robustImageAccess ||
+ device->vk.enabled_features.robustImageAccess2 ||
+ device->vk.enabled_extensions.EXT_pipeline_robustness,
+ .input_vertices = stage->nir->info.stage == MESA_SHADER_TESS_CTRL ?
+ stage->key.tcs.input_vertices : 0,
+ };
+ brw_preprocess_nir(compiler, stage->nir, &opts);
+
+ if (stage->nir->info.stage == MESA_SHADER_MESH) {
+ NIR_PASS(_, stage->nir, anv_nir_lower_set_vtx_and_prim_count);
+ NIR_PASS(_, stage->nir, nir_opt_dce);
+ NIR_PASS(_, stage->nir, nir_remove_dead_variables, nir_var_shader_out, NULL);
+ }
+
+ NIR_PASS(_, stage->nir, nir_opt_barrier_modes);
+
+ nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
+}
+
+static void
+anv_fill_pipeline_creation_feedback(const struct anv_graphics_base_pipeline *pipeline,
+ VkPipelineCreationFeedback *pipeline_feedback,
+ const VkGraphicsPipelineCreateInfo *info,
+ struct anv_pipeline_stage *stages)
+{
+ const VkPipelineCreationFeedbackCreateInfo *create_feedback =
+ vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
+ if (create_feedback) {
+ *create_feedback->pPipelineCreationFeedback = *pipeline_feedback;
+
+ /* VkPipelineCreationFeedbackCreateInfo:
+ *
+ * "An implementation must set or clear the
+ * VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT in
+ * VkPipelineCreationFeedback::flags for pPipelineCreationFeedback
+ * and every element of pPipelineStageCreationFeedbacks."
*
- * Any variable annotated as interpolated by sample essentially disables
- * coarse pixel shading. Unfortunately the CTS tests exercising this set
- * the varying value in the previous stage using a constant. Our NIR
- * infrastructure is clever enough to lookup variables across stages and
- * constant fold, removing the variable. So in order to comply with CTS
- * we have check variables here.
*/
- if (s == MESA_SHADER_FRAGMENT) {
- nir_foreach_variable_in_list(var, &stages[s].nir->variables) {
- if (var->data.sample) {
- stages[s].key.wm.coarse_pixel = false;
- break;
+ for (uint32_t i = 0; i < create_feedback->pipelineStageCreationFeedbackCount; i++) {
+ create_feedback->pPipelineStageCreationFeedbacks[i].flags &=
+ ~VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
+ }
+ /* This part is not really specified in the Vulkan spec at the moment.
+ * We're kind of guessing what the CTS wants. We may need to update this
+ * when https://gitlab.khronos.org/vulkan/vulkan/-/issues/3115 is
+ * clarified.
+ */
+ for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ if (stages[s].feedback_idx < create_feedback->pipelineStageCreationFeedbackCount) {
+ create_feedback->pPipelineStageCreationFeedbacks[
+ stages[s].feedback_idx] = stages[s].feedback;
+ }
+ }
+ }
+}
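For context, a sketch of the application-facing structure this helper fills in; the function name and parameters are illustrative, not part of the driver.
#include <vulkan/vulkan.h>
/* Build the struct an application chains into
 * VkGraphicsPipelineCreateInfo::pNext to receive the feedback written above.
 */
static VkPipelineCreationFeedbackCreateInfo
example_feedback_chain(VkPipelineCreationFeedback *pipeline_feedback,
                       VkPipelineCreationFeedback *stage_feedbacks,
                       uint32_t stage_count)
{
   return (VkPipelineCreationFeedbackCreateInfo) {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO,
      .pPipelineCreationFeedback = pipeline_feedback,
      .pipelineStageCreationFeedbackCount = stage_count,
      .pPipelineStageCreationFeedbacks = stage_feedbacks,
   };
}
After pipeline creation the application should check VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT in each flags field before trusting the durations, which is the bit being cleared and set above.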
+
+static uint32_t
+anv_graphics_pipeline_imported_shader_count(struct anv_pipeline_stage *stages)
+{
+ uint32_t count = 0;
+ for (uint32_t s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (stages[s].imported.bin != NULL)
+ count++;
+ }
+ return count;
+}
+
+static VkResult
+anv_graphics_pipeline_compile(struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_stage *stages,
+ struct vk_pipeline_cache *cache,
+ VkPipelineCreationFeedback *pipeline_feedback,
+ const VkGraphicsPipelineCreateInfo *info,
+ const struct vk_graphics_pipeline_state *state)
+{
+ int64_t pipeline_start = os_time_get_nano();
+
+ struct anv_device *device = pipeline->base.device;
+ const struct intel_device_info *devinfo = device->info;
+ const struct brw_compiler *compiler = device->physical->compiler;
+
+ /* Set up the shaders given in this VkGraphicsPipelineCreateInfo::pStages[].
+ * Other shaders imported from libraries should have been added by
+ * anv_graphics_pipeline_import_lib().
+ */
+ uint32_t shader_count = anv_graphics_pipeline_imported_shader_count(stages);
+ for (uint32_t i = 0; i < info->stageCount; i++) {
+ gl_shader_stage stage = vk_to_mesa_shader_stage(info->pStages[i].stage);
+
+ /* If a pipeline library is loaded in this stage, we should ignore the
+ * pStages[] entry of the same stage.
+ */
+ if (stages[stage].imported.bin != NULL)
+ continue;
+
+ stages[stage].stage = stage;
+ stages[stage].pipeline_pNext = info->pNext;
+ stages[stage].info = &info->pStages[i];
+ stages[stage].feedback_idx = shader_count++;
+
+ anv_stage_write_shader_hash(&stages[stage], device);
+ }
+
+ /* Prepare shader keys for all shaders in pipeline->base.active_stages
+ * (this includes libraries) before generating the hash for cache look up.
+ *
+ * We're doing this because the spec states that:
+ *
+ * "When an implementation is looking up a pipeline in a pipeline cache,
+ * if that pipeline is being created using linked libraries,
+ * implementations should always return an equivalent pipeline created
+ * with VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT if available,
+ * whether or not that bit was specified."
+ *
+ * So even if the application does not request link optimization, we have
+ * to do our cache lookup with the entire set of shader sha1s so that we
+ * can find the best optimized pipeline, as if we had compiled all the
+ * shaders together with full knowledge of the graphics state.
+ */
+ anv_graphics_pipeline_init_keys(pipeline, state, stages);
+
+ uint32_t view_mask = state->rp ? state->rp->view_mask : 0;
+
+ unsigned char sha1[20];
+ anv_pipeline_hash_graphics(pipeline, stages, view_mask, sha1);
+
+ for (unsigned s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ stages[s].cache_key.stage = s;
+ memcpy(stages[s].cache_key.sha1, sha1, sizeof(sha1));
+ }
+
+ const bool retain_shaders =
+ pipeline->base.flags & VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
+ const bool link_optimize =
+ pipeline->base.flags & VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT;
+
+ VkResult result = VK_SUCCESS;
+ const bool skip_cache_lookup =
+ (pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
+
+ if (!skip_cache_lookup) {
+ bool found_all_shaders =
+ anv_graphics_pipeline_load_cached_shaders(pipeline, cache, stages,
+ link_optimize,
+ pipeline_feedback);
+
+ if (found_all_shaders) {
+ /* If we need to retain shaders, we need to also load from the NIR
+ * cache.
+ */
+ if (pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB && retain_shaders) {
+ result = anv_graphics_pipeline_load_nir(pipeline, cache,
+ stages,
+ pipeline->base.mem_ctx,
+ false /* need_clone */);
+ if (result != VK_SUCCESS) {
+ vk_perf(VK_LOG_OBJS(cache ? &cache->base :
+ &pipeline->base.device->vk.base),
+ "Found all ISA shaders in the cache but not all NIR shaders.");
}
+
+ anv_graphics_lib_retain_shaders(pipeline, stages, false /* will_compile */);
+ }
+
+ if (result == VK_SUCCESS)
+ goto done;
+
+ for (unsigned s = 0; s < ANV_GRAPHICS_SHADER_STAGE_COUNT; s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ if (stages[s].nir) {
+ ralloc_free(stages[s].nir);
+ stages[s].nir = NULL;
+ }
+
+ assert(pipeline->shaders[s] != NULL);
+ anv_shader_bin_unref(device, pipeline->shaders[s]);
+ pipeline->shaders[s] = NULL;
}
}
+ }
- stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ if (pipeline->base.flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)
+ return VK_PIPELINE_COMPILE_REQUIRED;
+
+ void *tmp_ctx = ralloc_context(NULL);
+
+ result = anv_graphics_pipeline_load_nir(pipeline, cache, stages,
+ tmp_ctx, link_optimize /* need_clone */);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ /* Retain shaders now if asked, this only applies to libraries */
+ if (pipeline->base.type == ANV_PIPELINE_GRAPHICS_LIB && retain_shaders)
+ anv_graphics_lib_retain_shaders(pipeline, stages, true /* will_compile */);
+
+ /* The following steps will be executed for the shaders we need to compile:
+ *
+ * - specified through VkGraphicsPipelineCreateInfo::pStages[]
+ *
+ * - or compiled from libraries with retained shaders (libraries
+ * compiled with CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT) if the
+ * pipeline has the CREATE_LINK_TIME_OPTIMIZATION_BIT flag.
+ */
+
+ /* Preprocess all NIR shaders. */
+ for (int s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages,
+ link_optimize, s))
+ continue;
+
+ anv_stage_allocate_bind_map_tables(&pipeline->base, &stages[s], tmp_ctx);
+
+ anv_pipeline_nir_preprocess(&pipeline->base, &stages[s]);
+ }
+
+ if (stages[MESA_SHADER_MESH].info && stages[MESA_SHADER_FRAGMENT].info) {
+ anv_apply_per_prim_attr_wa(stages[MESA_SHADER_MESH].nir,
+ stages[MESA_SHADER_FRAGMENT].nir,
+ device,
+ info);
}
/* Walk backwards to link */
struct anv_pipeline_stage *next_stage = NULL;
- for (int s = ARRAY_SIZE(pipeline->shaders) - 1; s >= 0; s--) {
- if (!stages[s].entrypoint)
+ for (int i = ARRAY_SIZE(graphics_shader_order) - 1; i >= 0; i--) {
+ gl_shader_stage s = graphics_shader_order[i];
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages,
+ link_optimize, s))
continue;
+ struct anv_pipeline_stage *stage = &stages[s];
+
switch (s) {
case MESA_SHADER_VERTEX:
- anv_pipeline_link_vs(compiler, &stages[s], next_stage);
+ anv_pipeline_link_vs(compiler, stage, next_stage);
break;
case MESA_SHADER_TESS_CTRL:
- anv_pipeline_link_tcs(compiler, &stages[s], next_stage);
+ anv_pipeline_link_tcs(compiler, stage, next_stage);
break;
case MESA_SHADER_TESS_EVAL:
- anv_pipeline_link_tes(compiler, &stages[s], next_stage);
+ anv_pipeline_link_tes(compiler, stage, next_stage);
break;
case MESA_SHADER_GEOMETRY:
- anv_pipeline_link_gs(compiler, &stages[s], next_stage);
+ anv_pipeline_link_gs(compiler, stage, next_stage);
+ break;
+ case MESA_SHADER_TASK:
+ anv_pipeline_link_task(compiler, stage, next_stage);
+ break;
+ case MESA_SHADER_MESH:
+ anv_pipeline_link_mesh(compiler, stage, next_stage);
break;
case MESA_SHADER_FRAGMENT:
- anv_pipeline_link_fs(compiler, &stages[s]);
+ anv_pipeline_link_fs(compiler, stage, state->rp);
break;
default:
unreachable("Invalid graphics shader stage");
}
- next_stage = &stages[s];
+ next_stage = stage;
}
- if (pipeline->base.device->info.ver >= 12 &&
- pipeline->subpass->view_mask != 0) {
+ bool use_primitive_replication = false;
+ if (devinfo->ver >= 12 && view_mask != 0) {
/* For some pipelines HW Primitive Replication can be used instead of
* instancing to implement Multiview. This depends on how viewIndex is
* used in all the active shaders, so this check can't be done per
* individual shader.
*/
- nir_shader *shaders[MESA_SHADER_STAGES] = {};
- for (unsigned s = 0; s < MESA_SHADER_STAGES; s++)
+ nir_shader *shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {};
+ for (unsigned s = 0; s < ARRAY_SIZE(shaders); s++)
shaders[s] = stages[s].nir;
- pipeline->use_primitive_replication =
- anv_check_for_primitive_replication(shaders, pipeline);
- } else {
- pipeline->use_primitive_replication = false;
+ use_primitive_replication =
+ anv_check_for_primitive_replication(device,
+ pipeline->base.active_stages,
+ shaders, view_mask);
}
struct anv_pipeline_stage *prev_stage = NULL;
- for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
- if (!stages[s].entrypoint)
+ for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) {
+ gl_shader_stage s = graphics_shader_order[i];
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages,
+ link_optimize, s))
continue;
+ struct anv_pipeline_stage *stage = &stages[s];
+
int64_t stage_start = os_time_get_nano();
- void *stage_ctx = ralloc_context(NULL);
+ anv_pipeline_lower_nir(&pipeline->base, tmp_ctx, stage,
+ &pipeline->base.layout, view_mask,
+ use_primitive_replication);
+
+ struct shader_info *cur_info = &stage->nir->info;
- anv_pipeline_lower_nir(&pipeline->base, stage_ctx, &stages[s], layout);
+ if (prev_stage && compiler->nir_options[s]->unify_interfaces) {
+ struct shader_info *prev_info = &prev_stage->nir->info;
- if (prev_stage && compiler->glsl_compiler_options[s].NirOptions->unify_interfaces) {
- prev_stage->nir->info.outputs_written |= stages[s].nir->info.inputs_read &
+ prev_info->outputs_written |= cur_info->inputs_read &
~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
- stages[s].nir->info.inputs_read |= prev_stage->nir->info.outputs_written &
+ cur_info->inputs_read |= prev_info->outputs_written &
~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
- prev_stage->nir->info.patch_outputs_written |= stages[s].nir->info.patch_inputs_read;
- stages[s].nir->info.patch_inputs_read |= prev_stage->nir->info.patch_outputs_written;
+ prev_info->patch_outputs_written |= cur_info->patch_inputs_read;
+ cur_info->patch_inputs_read |= prev_info->patch_outputs_written;
}
- ralloc_free(stage_ctx);
+ anv_fixup_subgroup_size(device, cur_info);
- stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ stage->feedback.duration += os_time_get_nano() - stage_start;
- prev_stage = &stages[s];
+ prev_stage = stage;
+ }
+
+ /* When the platform can write the per-primitive shading rate variable,
+ * KHR_fragment_shading_rate is enabled, and either:
+ * - there could be a fragment shader but we don't have it yet, or
+ * - the fragment shader needs the fragment shading rate,
+ *
+ * figure out the last geometry stage that should write the primitive
+ * shading rate, and ensure it is marked as used there. The backend will
+ * write a default value if the shader doesn't actually write it.
+ *
+ * We iterate backwards over the stages and stop on the first shader that
+ * can set the value.
+ *
+ * Don't apply this to MESH stages, as this is a per-primitive thing.
+ */
+ if (devinfo->has_coarse_pixel_primitive_and_cb &&
+ device->vk.enabled_extensions.KHR_fragment_shading_rate &&
+ pipeline_has_coarse_pixel(state->dynamic, state->ms, state->fsr) &&
+ (!stages[MESA_SHADER_FRAGMENT].info ||
+ stages[MESA_SHADER_FRAGMENT].key.wm.coarse_pixel) &&
+ stages[MESA_SHADER_MESH].nir == NULL) {
+ struct anv_pipeline_stage *last_psr = NULL;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) {
+ gl_shader_stage s =
+ graphics_shader_order[ARRAY_SIZE(graphics_shader_order) - i - 1];
+
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages,
+ link_optimize, s) ||
+ !gl_shader_stage_can_set_fragment_shading_rate(s))
+ continue;
+
+ last_psr = &stages[s];
+ break;
+ }
+
+ /* Only set primitive shading rate if there is a pre-rasterization
+ * shader in this pipeline/pipeline-library.
+ */
+ if (last_psr)
+ last_psr->nir->info.outputs_written |= VARYING_BIT_PRIMITIVE_SHADING_RATE;
}
prev_stage = NULL;
- for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
- if (!stages[s].entrypoint)
+ for (unsigned i = 0; i < ARRAY_SIZE(graphics_shader_order); i++) {
+ gl_shader_stage s = graphics_shader_order[i];
+ struct anv_pipeline_stage *stage = &stages[s];
+
+ if (anv_graphics_pipeline_skip_shader_compile(pipeline, stages, link_optimize, s))
continue;
int64_t stage_start = os_time_get_nano();
void *stage_ctx = ralloc_context(NULL);
- nir_xfb_info *xfb_info = NULL;
- if (s == MESA_SHADER_VERTEX ||
- s == MESA_SHADER_TESS_EVAL ||
- s == MESA_SHADER_GEOMETRY)
- xfb_info = nir_gather_xfb_info(stages[s].nir, stage_ctx);
-
switch (s) {
case MESA_SHADER_VERTEX:
anv_pipeline_compile_vs(compiler, stage_ctx, pipeline,
- &stages[s]);
+ stage, view_mask);
break;
case MESA_SHADER_TESS_CTRL:
- anv_pipeline_compile_tcs(compiler, stage_ctx, pipeline->base.device,
- &stages[s], prev_stage);
+ anv_pipeline_compile_tcs(compiler, stage_ctx, device,
+ stage, prev_stage);
break;
case MESA_SHADER_TESS_EVAL:
- anv_pipeline_compile_tes(compiler, stage_ctx, pipeline->base.device,
- &stages[s], prev_stage);
+ anv_pipeline_compile_tes(compiler, stage_ctx, device,
+ stage, prev_stage);
break;
case MESA_SHADER_GEOMETRY:
- anv_pipeline_compile_gs(compiler, stage_ctx, pipeline->base.device,
- &stages[s], prev_stage);
+ anv_pipeline_compile_gs(compiler, stage_ctx, device,
+ stage, prev_stage);
+ break;
+ case MESA_SHADER_TASK:
+ anv_pipeline_compile_task(compiler, stage_ctx, device,
+ stage);
+ break;
+ case MESA_SHADER_MESH:
+ anv_pipeline_compile_mesh(compiler, stage_ctx, device,
+ stage, prev_stage);
break;
case MESA_SHADER_FRAGMENT:
- anv_pipeline_compile_fs(compiler, stage_ctx, pipeline->base.device,
- &stages[s], prev_stage);
+ anv_pipeline_compile_fs(compiler, stage_ctx, device,
+ stage, prev_stage, pipeline,
+ view_mask,
+ use_primitive_replication);
break;
default:
unreachable("Invalid graphics shader stage");
}
- if (stages[s].code == NULL) {
+ if (stage->code == NULL) {
ralloc_free(stage_ctx);
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail;
}
- anv_nir_validate_push_layout(&stages[s].prog_data.base,
- &stages[s].bind_map);
-
- struct anv_shader_bin *bin =
- anv_device_upload_kernel(pipeline->base.device, cache, s,
- &stages[s].cache_key,
- sizeof(stages[s].cache_key),
- stages[s].code,
- stages[s].prog_data.base.program_size,
- &stages[s].prog_data.base,
- brw_prog_data_size(s),
- stages[s].stats, stages[s].num_stats,
- xfb_info, &stages[s].bind_map);
- if (!bin) {
+ anv_nir_validate_push_layout(&stage->prog_data.base,
+ &stage->bind_map);
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = s,
+ .key_data = &stage->cache_key,
+ .key_size = sizeof(stage->cache_key),
+ .kernel_data = stage->code,
+ .kernel_size = stage->prog_data.base.program_size,
+ .prog_data = &stage->prog_data.base,
+ .prog_data_size = brw_prog_data_size(s),
+ .stats = stage->stats,
+ .num_stats = stage->num_stats,
+ .xfb_info = stage->nir->xfb_info,
+ .bind_map = &stage->bind_map,
+ .push_desc_info = &stage->push_desc_info,
+ .dynamic_push_values = stage->dynamic_push_values,
+ };
+
+ stage->bin =
+ anv_device_upload_kernel(device, cache, &upload_params);
+ if (!stage->bin) {
ralloc_free(stage_ctx);
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ result = vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
goto fail;
}
- anv_pipeline_add_executables(&pipeline->base, &stages[s], bin);
+ anv_pipeline_add_executables(&pipeline->base, stage);
+ pipeline->source_hashes[s] = stage->source_hash;
+ pipeline->shaders[s] = stage->bin;
- pipeline->shaders[s] = bin;
ralloc_free(stage_ctx);
- stages[s].feedback.duration += os_time_get_nano() - stage_start;
+ stage->feedback.duration += os_time_get_nano() - stage_start;
- prev_stage = &stages[s];
+ prev_stage = stage;
}
- ralloc_free(pipeline_ctx);
+ /* Finally add the imported shaders that were not compiled as part of this
+ * step.
+ */
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ if (pipeline->shaders[s] != NULL)
+ continue;
-done:
+ /* We should have recompiled everything with link optimization. */
+ assert(!link_optimize);
- if (pipeline->shaders[MESA_SHADER_FRAGMENT] &&
- pipeline->shaders[MESA_SHADER_FRAGMENT]->prog_data->program_size == 0) {
- /* This can happen if we decided to implicitly disable the fragment
- * shader. See anv_pipeline_compile_fs().
- */
- anv_shader_bin_unref(pipeline->base.device,
- pipeline->shaders[MESA_SHADER_FRAGMENT]);
- pipeline->shaders[MESA_SHADER_FRAGMENT] = NULL;
- pipeline->active_stages &= ~VK_SHADER_STAGE_FRAGMENT_BIT;
+ struct anv_pipeline_stage *stage = &stages[s];
+
+ pipeline->source_hashes[s] = stage->source_hash;
+ pipeline->shaders[s] = anv_shader_bin_ref(stage->imported.bin);
}
- pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
+ ralloc_free(tmp_ctx);
- const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback =
- vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
- if (create_feedback) {
- *create_feedback->pPipelineCreationFeedback = pipeline_feedback;
+done:
- assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount);
- for (uint32_t i = 0; i < info->stageCount; i++) {
- gl_shader_stage s = vk_to_mesa_shader_stage(info->pStages[i].stage);
- create_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback;
- }
+ /* Write the feedback index into the pipeline */
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
+ if (!anv_pipeline_base_has_stage(pipeline, s))
+ continue;
+
+ struct anv_pipeline_stage *stage = &stages[s];
+ pipeline->feedback_index[s] = stage->feedback_idx;
+ pipeline->robust_flags[s] = stage->robust_flags;
+
+ anv_pipeline_account_shader(&pipeline->base, pipeline->shaders[s]);
+ }
+
+ pipeline_feedback->duration = os_time_get_nano() - pipeline_start;
+
+ if (pipeline->shaders[MESA_SHADER_FRAGMENT]) {
+ pipeline->fragment_dynamic =
+ anv_graphics_pipeline_stage_fragment_dynamic(
+ &stages[MESA_SHADER_FRAGMENT]);
}
return VK_SUCCESS;
fail:
- ralloc_free(pipeline_ctx);
+ ralloc_free(tmp_ctx);
for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
if (pipeline->shaders[s])
- anv_shader_bin_unref(pipeline->base.device, pipeline->shaders[s]);
+ anv_shader_bin_unref(device, pipeline->shaders[s]);
}
return result;
}
-VkResult
+static VkResult
anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
- const VkComputePipelineCreateInfo *info,
- const struct vk_shader_module *module,
- const char *entrypoint,
- const VkSpecializationInfo *spec_info)
-{
- VkPipelineCreationFeedbackEXT pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ struct vk_pipeline_cache *cache,
+ const VkComputePipelineCreateInfo *info)
+{
+ ASSERTED const VkPipelineShaderStageCreateInfo *sinfo = &info->stage;
+ assert(sinfo->stage == VK_SHADER_STAGE_COMPUTE_BIT);
+
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
};
int64_t pipeline_start = os_time_get_nano();
- const struct brw_compiler *compiler = pipeline->base.device->physical->compiler;
+ struct anv_device *device = pipeline->base.device;
+ const struct brw_compiler *compiler = device->physical->compiler;
struct anv_pipeline_stage stage = {
.stage = MESA_SHADER_COMPUTE,
- .module = module,
- .entrypoint = entrypoint,
- .spec_info = spec_info,
+ .info = &info->stage,
+ .pipeline_pNext = info->pNext,
.cache_key = {
.stage = MESA_SHADER_COMPUTE,
},
.feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
},
};
- anv_pipeline_hash_shader(stage.module,
- stage.entrypoint,
- MESA_SHADER_COMPUTE,
- stage.spec_info,
- stage.shader_sha1);
+ anv_stage_write_shader_hash(&stage, device);
- struct anv_shader_bin *bin = NULL;
-
- const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *rss_info =
- vk_find_struct_const(info->stage.pNext,
- PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT);
-
- populate_cs_prog_key(&pipeline->base.device->info, info->stage.flags,
- pipeline->base.device->robust_buffer_access,
- rss_info, &stage.key.cs);
-
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
+ populate_cs_prog_key(&stage, device);
const bool skip_cache_lookup =
(pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
- anv_pipeline_hash_compute(pipeline, layout, &stage, stage.cache_key.sha1);
+ anv_pipeline_hash_compute(pipeline, &stage, stage.cache_key.sha1);
bool cache_hit = false;
if (!skip_cache_lookup) {
- bin = anv_device_search_for_kernel(pipeline->base.device, cache,
- &stage.cache_key,
- sizeof(stage.cache_key),
- &cache_hit);
+ stage.bin = anv_device_search_for_kernel(device, cache,
+ &stage.cache_key,
+ sizeof(stage.cache_key),
+ &cache_hit);
}
- if (bin == NULL &&
- (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT))
- return VK_PIPELINE_COMPILE_REQUIRED_EXT;
+ if (stage.bin == NULL &&
+ (pipeline->base.flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT))
+ return VK_PIPELINE_COMPILE_REQUIRED;
void *mem_ctx = ralloc_context(NULL);
- if (bin == NULL) {
+ if (stage.bin == NULL) {
int64_t stage_start = os_time_get_nano();
- stage.bind_map = (struct anv_pipeline_bind_map) {
- .surface_to_descriptor = stage.surface_to_descriptor,
- .sampler_to_descriptor = stage.sampler_to_descriptor
- };
+ anv_stage_allocate_bind_map_tables(&pipeline->base, &stage, mem_ctx);
/* Set up a binding for the gl_NumWorkGroups */
stage.bind_map.surface_count = 1;
stage.bind_map.surface_to_descriptor[0] = (struct anv_pipeline_binding) {
.set = ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS,
+ .binding = UINT32_MAX,
};
stage.nir = anv_pipeline_stage_get_nir(&pipeline->base, cache, mem_ctx, &stage);
if (stage.nir == NULL) {
ralloc_free(mem_ctx);
- return vk_error(VK_ERROR_UNKNOWN);
+ return vk_error(pipeline, VK_ERROR_UNKNOWN);
}
- NIR_PASS_V(stage.nir, anv_nir_add_base_work_group_id);
+ anv_pipeline_nir_preprocess(&pipeline->base, &stage);
- anv_pipeline_lower_nir(&pipeline->base, mem_ctx, &stage, layout);
+ anv_pipeline_lower_nir(&pipeline->base, mem_ctx, &stage,
+ &pipeline->base.layout, 0 /* view_mask */,
+ false /* use_primitive_replication */);
- NIR_PASS_V(stage.nir, brw_nir_lower_cs_intrinsics);
+ anv_fixup_subgroup_size(device, &stage.nir->info);
stage.num_stats = 1;
struct brw_compile_cs_params params = {
- .nir = stage.nir,
+ .base = {
+ .nir = stage.nir,
+ .stats = stage.stats,
+ .log_data = device,
+ .mem_ctx = mem_ctx,
+ },
.key = &stage.key.cs,
.prog_data = &stage.prog_data.cs,
- .stats = stage.stats,
- .log_data = pipeline->base.device,
};
- stage.code = brw_compile_cs(compiler, mem_ctx, &params);
+ stage.code = brw_compile_cs(compiler, &params);
if (stage.code == NULL) {
ralloc_free(mem_ctx);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
}
anv_nir_validate_push_layout(&stage.prog_data.base, &stage.bind_map);
@@ -1900,588 +2684,662 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL;
}
- const unsigned code_size = stage.prog_data.base.program_size;
- bin = anv_device_upload_kernel(pipeline->base.device, cache,
- MESA_SHADER_COMPUTE,
- &stage.cache_key, sizeof(stage.cache_key),
- stage.code, code_size,
- &stage.prog_data.base,
- sizeof(stage.prog_data.cs),
- stage.stats, stage.num_stats,
- NULL, &stage.bind_map);
- if (!bin) {
+ struct anv_shader_upload_params upload_params = {
+ .stage = MESA_SHADER_COMPUTE,
+ .key_data = &stage.cache_key,
+ .key_size = sizeof(stage.cache_key),
+ .kernel_data = stage.code,
+ .kernel_size = stage.prog_data.base.program_size,
+ .prog_data = &stage.prog_data.base,
+ .prog_data_size = sizeof(stage.prog_data.cs),
+ .stats = stage.stats,
+ .num_stats = stage.num_stats,
+ .bind_map = &stage.bind_map,
+ .push_desc_info = &stage.push_desc_info,
+ .dynamic_push_values = stage.dynamic_push_values,
+ };
+
+ stage.bin = anv_device_upload_kernel(device, cache, &upload_params);
+ if (!stage.bin) {
ralloc_free(mem_ctx);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
}
stage.feedback.duration = os_time_get_nano() - stage_start;
}
- anv_pipeline_add_executables(&pipeline->base, &stage, bin);
+ anv_pipeline_account_shader(&pipeline->base, stage.bin);
+ anv_pipeline_add_executables(&pipeline->base, &stage);
+ pipeline->source_hash = stage.source_hash;
ralloc_free(mem_ctx);
if (cache_hit) {
stage.feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
}
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
- const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback =
- vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
+ const VkPipelineCreationFeedbackCreateInfo *create_feedback =
+ vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
if (create_feedback) {
*create_feedback->pPipelineCreationFeedback = pipeline_feedback;
- assert(create_feedback->pipelineStageCreationFeedbackCount == 1);
- create_feedback->pPipelineStageCreationFeedbacks[0] = stage.feedback;
+ if (create_feedback->pipelineStageCreationFeedbackCount) {
+ assert(create_feedback->pipelineStageCreationFeedbackCount == 1);
+ create_feedback->pPipelineStageCreationFeedbacks[0] = stage.feedback;
+ }
}
- pipeline->cs = bin;
+ pipeline->cs = stage.bin;
return VK_SUCCESS;
}
-/**
- * Copy pipeline state not marked as dynamic.
- * Dynamic state is pipeline state which hasn't been provided at pipeline
- * creation time, but is dynamically provided afterwards using various
- * vkCmdSet* functions.
- *
- * The set of state considered "non_dynamic" is determined by the pieces of
- * state that have their corresponding VkDynamicState enums omitted from
- * VkPipelineDynamicStateCreateInfo::pDynamicStates.
- *
- * @param[out] pipeline Destination non_dynamic state.
- * @param[in] pCreateInfo Source of non_dynamic state to be copied.
- */
-static void
-copy_non_dynamic_state(struct anv_graphics_pipeline *pipeline,
- const VkGraphicsPipelineCreateInfo *pCreateInfo)
+static VkResult
+anv_compute_pipeline_create(struct anv_device *device,
+ struct vk_pipeline_cache *cache,
+ const VkComputePipelineCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkPipeline *pPipeline)
{
- anv_cmd_dirty_mask_t states = ANV_CMD_DIRTY_DYNAMIC_ALL;
- struct anv_subpass *subpass = pipeline->subpass;
+ struct anv_compute_pipeline *pipeline;
+ VkResult result;
- pipeline->dynamic_state = default_dynamic_state;
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
- states &= ~pipeline->dynamic_states;
+ pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pipeline == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- struct anv_dynamic_state *dynamic = &pipeline->dynamic_state;
+ result = anv_pipeline_init(&pipeline->base, device,
+ ANV_PIPELINE_COMPUTE,
+ vk_compute_pipeline_create_flags(pCreateInfo),
+ pAllocator);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
- bool raster_discard =
- pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
- !(pipeline->dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
- /* Section 9.2 of the Vulkan 1.0.15 spec says:
- *
- * pViewportState is [...] NULL if the pipeline
- * has rasterization disabled.
- */
- if (!raster_discard) {
- assert(pCreateInfo->pViewportState);
-
- dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
- if (states & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT) {
- typed_memcpy(dynamic->viewport.viewports,
- pCreateInfo->pViewportState->pViewports,
- pCreateInfo->pViewportState->viewportCount);
- }
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+ anv_pipeline_init_layout(&pipeline->base, pipeline_layout);
- dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
- if (states & ANV_CMD_DIRTY_DYNAMIC_SCISSOR) {
- typed_memcpy(dynamic->scissor.scissors,
- pCreateInfo->pViewportState->pScissors,
- pCreateInfo->pViewportState->scissorCount);
- }
- }
+ pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
- if (states & ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth;
- }
+ anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
+ pipeline->batch_data, sizeof(pipeline->batch_data));
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->depth_bias.bias =
- pCreateInfo->pRasterizationState->depthBiasConstantFactor;
- dynamic->depth_bias.clamp =
- pCreateInfo->pRasterizationState->depthBiasClamp;
- dynamic->depth_bias.slope =
- pCreateInfo->pRasterizationState->depthBiasSlopeFactor;
+ result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo);
+ if (result != VK_SUCCESS) {
+ anv_pipeline_finish(&pipeline->base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
}
- if (states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->cull_mode =
- pCreateInfo->pRasterizationState->cullMode;
- }
+ anv_genX(device->info, compute_pipeline_emit)(pipeline);
- if (states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->front_face =
- pCreateInfo->pRasterizationState->frontFace;
- }
+ ANV_RMV(compute_pipeline_create, device, pipeline, false);
- if (states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- assert(pCreateInfo->pInputAssemblyState);
- dynamic->primitive_topology = pCreateInfo->pInputAssemblyState->topology;
- }
+ *pPipeline = anv_pipeline_to_handle(&pipeline->base);
- if (states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->raster_discard =
- pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
- }
+ return pipeline->base.batch.status;
+}
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE) {
- assert(pCreateInfo->pRasterizationState);
- dynamic->depth_bias_enable =
- pCreateInfo->pRasterizationState->depthBiasEnable;
- }
+VkResult anv_CreateComputePipelines(
+ VkDevice _device,
+ VkPipelineCache pipelineCache,
+ uint32_t count,
+ const VkComputePipelineCreateInfo* pCreateInfos,
+ const VkAllocationCallbacks* pAllocator,
+ VkPipeline* pPipelines)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache);
- if (states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
- assert(pCreateInfo->pInputAssemblyState);
- dynamic->primitive_restart_enable =
- pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
- }
+ VkResult result = VK_SUCCESS;
- /* Section 9.2 of the Vulkan 1.0.15 spec says:
- *
- * pColorBlendState is [...] NULL if the pipeline has rasterization
- * disabled or if the subpass of the render pass the pipeline is
- * created against does not use any color attachments.
- */
- bool uses_color_att = false;
- for (unsigned i = 0; i < subpass->color_count; ++i) {
- if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED) {
- uses_color_att = true;
+ unsigned i;
+ for (i = 0; i < count; i++) {
+ const VkPipelineCreateFlags2KHR flags =
+ vk_compute_pipeline_create_flags(&pCreateInfos[i]);
+ VkResult res = anv_compute_pipeline_create(device, pipeline_cache,
+ &pCreateInfos[i],
+ pAllocator, &pPipelines[i]);
+
+ if (res == VK_SUCCESS)
+ continue;
+
+ /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED as it
+ * is not obvious which error should be reported when there are two
+ * different failures.
+ */
+ result = res;
+ if (res != VK_PIPELINE_COMPILE_REQUIRED)
break;
- }
- }
- if (uses_color_att && !raster_discard) {
- assert(pCreateInfo->pColorBlendState);
+ pPipelines[i] = VK_NULL_HANDLE;
- if (states & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
- typed_memcpy(dynamic->blend_constants,
- pCreateInfo->pColorBlendState->blendConstants, 4);
+ if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
+ break;
}
- /* If there is no depthstencil attachment, then don't read
- * pDepthStencilState. The Vulkan spec states that pDepthStencilState may
- * be NULL in this case. Even if pDepthStencilState is non-NULL, there is
- * no need to override the depthstencil defaults in
- * anv_pipeline::dynamic_state when there is no depthstencil attachment.
- *
- * Section 9.2 of the Vulkan 1.0.15 spec says:
- *
- * pDepthStencilState is [...] NULL if the pipeline has rasterization
- * disabled or if the subpass of the render pass the pipeline is created
- * against does not use a depth/stencil attachment.
- */
- if (!raster_discard && subpass->depth_stencil_attachment) {
- assert(pCreateInfo->pDepthStencilState);
-
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS) {
- dynamic->depth_bounds.min =
- pCreateInfo->pDepthStencilState->minDepthBounds;
- dynamic->depth_bounds.max =
- pCreateInfo->pDepthStencilState->maxDepthBounds;
- }
+ for (; i < count; i++)
+ pPipelines[i] = VK_NULL_HANDLE;
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK) {
- dynamic->stencil_compare_mask.front =
- pCreateInfo->pDepthStencilState->front.compareMask;
- dynamic->stencil_compare_mask.back =
- pCreateInfo->pDepthStencilState->back.compareMask;
- }
+ return result;
+}
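An application-side sketch of the two flags this loop and anv_pipeline_compile_cs react to; the wrapper function is hypothetical. FAIL_ON_PIPELINE_COMPILE_REQUIRED turns a cache miss into VK_PIPELINE_COMPILE_REQUIRED instead of a compile, and EARLY_RETURN_ON_FAILURE stops the batch at the first failure, with the remaining handles set to VK_NULL_HANDLE as done above.
#include <assert.h>
#include <vulkan/vulkan.h>
static VkResult
example_create_from_cache_only(VkDevice device, VkPipelineCache cache,
                               const VkComputePipelineCreateInfo *infos,
                               uint32_t count, VkPipeline *pipelines)
{
   VkComputePipelineCreateInfo local[8];
   assert(count <= 8);
   for (uint32_t i = 0; i < count; i++) {
      local[i] = infos[i];
      local[i].flags |= VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT |
                        VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT;
   }
   /* Returns VK_PIPELINE_COMPILE_REQUIRED as soon as one pipeline is not in
    * the cache; the application can then schedule a real compile elsewhere.
    */
   return vkCreateComputePipelines(device, cache, count, local, NULL, pipelines);
}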
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK) {
- dynamic->stencil_write_mask.front =
- pCreateInfo->pDepthStencilState->front.writeMask;
- dynamic->stencil_write_mask.back =
- pCreateInfo->pDepthStencilState->back.writeMask;
- }
+/**
+ * Calculate the desired L3 partitioning based on the current state of the
+ * pipeline. For now this simply returns the conservative defaults calculated
+ * by get_default_l3_weights(), but we could probably do better by gathering
+ * more statistics from the pipeline state (e.g. guess of expected URB usage
+ * and bound surfaces), or by using feedback from performance counters.
+ */
+void
+anv_pipeline_setup_l3_config(struct anv_pipeline *pipeline, bool needs_slm)
+{
+ const struct intel_device_info *devinfo = pipeline->device->info;
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE) {
- dynamic->stencil_reference.front =
- pCreateInfo->pDepthStencilState->front.reference;
- dynamic->stencil_reference.back =
- pCreateInfo->pDepthStencilState->back.reference;
- }
+ const struct intel_l3_weights w =
+ intel_get_default_l3_weights(devinfo, true, needs_slm);
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE) {
- dynamic->depth_test_enable =
- pCreateInfo->pDepthStencilState->depthTestEnable;
- }
+ pipeline->l3_config = intel_get_l3_config(devinfo, w);
+}
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE) {
- dynamic->depth_write_enable =
- pCreateInfo->pDepthStencilState->depthWriteEnable;
- }
+static uint32_t
+get_vs_input_elements(const struct brw_vs_prog_data *vs_prog_data)
+{
+ /* Pull inputs_read out of the VS prog data */
+ const uint64_t inputs_read = vs_prog_data->inputs_read;
+ const uint64_t double_inputs_read =
+ vs_prog_data->double_inputs_read & inputs_read;
+ assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
+ const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
+ const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
+
+ return __builtin_popcount(elements) -
+ __builtin_popcount(elements_double) / 2;
+}
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP) {
- dynamic->depth_compare_op =
- pCreateInfo->pDepthStencilState->depthCompareOp;
- }
+static void
+anv_graphics_pipeline_emit(struct anv_graphics_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state)
+{
+ pipeline->view_mask = state->rp->view_mask;
- if (states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
- dynamic->depth_bounds_test_enable =
- pCreateInfo->pDepthStencilState->depthBoundsTestEnable;
- }
+ anv_pipeline_setup_l3_config(&pipeline->base.base, false);
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE) {
- dynamic->stencil_test_enable =
- pCreateInfo->pDepthStencilState->stencilTestEnable;
- }
+ if (anv_pipeline_is_primitive(pipeline)) {
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
- if (states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP) {
- const VkPipelineDepthStencilStateCreateInfo *info =
- pCreateInfo->pDepthStencilState;
- memcpy(&dynamic->stencil_op.front, &info->front,
- sizeof(dynamic->stencil_op.front));
- memcpy(&dynamic->stencil_op.back, &info->back,
- sizeof(dynamic->stencil_op.back));
- }
+ /* The total number of vertex elements we need to program. We might need
+ * a couple more to implement some of the draw parameters.
+ */
+ pipeline->svgs_count =
+ (vs_prog_data->uses_vertexid ||
+ vs_prog_data->uses_instanceid ||
+ vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) + vs_prog_data->uses_drawid;
+
+ pipeline->vs_input_elements = get_vs_input_elements(vs_prog_data);
+
+ pipeline->vertex_input_elems =
+ (BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI) ?
+ 0 : pipeline->vs_input_elements) + pipeline->svgs_count;
+
+ /* Our implementation of VK_KHR_multiview uses instancing to draw the
+ * different views when primitive replication cannot be used. If the
+ * client asks for instancing, we need to multiply both the client's
+ * instance count at draw time and the instance divisor in the vertex
+ * bindings by the number of views, to ensure that we repeat the client's
+ * per-instance data once for each view.
+ */
+ const bool uses_primitive_replication =
+ anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots > 1;
+ pipeline->instance_multiplier = 1;
+ if (pipeline->view_mask && !uses_primitive_replication)
+ pipeline->instance_multiplier = util_bitcount(pipeline->view_mask);
+ } else {
+ assert(anv_pipeline_is_mesh(pipeline));
+ /* TODO(mesh): Mesh vs. Multiview with Instancing. */
}
- const VkPipelineRasterizationLineStateCreateInfoEXT *line_state =
- vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
- PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
- if (!raster_discard && line_state && line_state->stippledLineEnable) {
- if (states & ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) {
- dynamic->line_stipple.factor = line_state->lineStippleFactor;
- dynamic->line_stipple.pattern = line_state->lineStipplePattern;
- }
- }
- const VkPipelineMultisampleStateCreateInfo *ms_info =
- pCreateInfo->pRasterizationState->rasterizerDiscardEnable ? NULL :
- pCreateInfo->pMultisampleState;
- if (states & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) {
- const VkPipelineSampleLocationsStateCreateInfoEXT *sl_info = ms_info ?
- vk_find_struct_const(ms_info, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT) : NULL;
-
- if (sl_info) {
- dynamic->sample_locations.samples =
- sl_info->sampleLocationsInfo.sampleLocationsCount;
- const VkSampleLocationEXT *positions =
- sl_info->sampleLocationsInfo.pSampleLocations;
- for (uint32_t i = 0; i < dynamic->sample_locations.samples; i++) {
- dynamic->sample_locations.locations[i].x = positions[i].x;
- dynamic->sample_locations.locations[i].y = positions[i].y;
- }
- }
+ pipeline->dynamic_patch_control_points =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) &&
+ BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS) &&
+ (pipeline->base.shaders[MESA_SHADER_TESS_CTRL]->dynamic_push_values &
+ ANV_DYNAMIC_PUSH_INPUT_VERTICES);
+
+ if (pipeline->base.shaders[MESA_SHADER_FRAGMENT] && state->ms) {
+ pipeline->sample_shading_enable = state->ms->sample_shading_enable;
+ pipeline->min_sample_shading = state->ms->min_sample_shading;
}
- /* Ensure we always have valid values for sample_locations. */
- if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations &&
- dynamic->sample_locations.samples == 0) {
- dynamic->sample_locations.samples =
- ms_info ? ms_info->rasterizationSamples : 1;
- const struct intel_sample_position *positions =
- intel_get_sample_positions(dynamic->sample_locations.samples);
- for (uint32_t i = 0; i < dynamic->sample_locations.samples; i++) {
- dynamic->sample_locations.locations[i].x = positions[i].x;
- dynamic->sample_locations.locations[i].y = positions[i].y;
- }
+
+ const struct anv_device *device = pipeline->base.base.device;
+ const struct intel_device_info *devinfo = device->info;
+ anv_genX(devinfo, graphics_pipeline_emit)(pipeline, state);
+}
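A small worked sketch of the instance_multiplier computed above, assuming the same semantics: with view_mask = 0x5 (views 0 and 2) and no primitive replication, util_bitcount(0x5) = 2, so each draw effectively runs with twice the client's instance count. The helper name is hypothetical.
#include "util/bitscan.h"
static uint32_t
example_instance_multiplier(uint32_t view_mask, bool uses_primitive_replication)
{
   if (view_mask == 0 || uses_primitive_replication)
      return 1;                      /* no multiview, or HW replication */
   return util_bitcount(view_mask);  /* one HW instance per enabled view */
}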
+
+static void
+anv_graphics_pipeline_import_layout(struct anv_graphics_base_pipeline *pipeline,
+ struct anv_pipeline_sets_layout *layout)
+{
+ pipeline->base.layout.independent_sets |= layout->independent_sets;
+
+ for (uint32_t s = 0; s < layout->num_sets; s++) {
+ if (layout->set[s].layout == NULL)
+ continue;
+
+ anv_pipeline_sets_layout_add(&pipeline->base.layout, s,
+ layout->set[s].layout);
}
+}
- if (states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) {
- if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
- uses_color_att) {
- assert(pCreateInfo->pColorBlendState);
- const VkPipelineColorWriteCreateInfoEXT *color_write_info =
- vk_find_struct_const(pCreateInfo->pColorBlendState->pNext,
- PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
+static void
+anv_graphics_pipeline_import_lib(struct anv_graphics_base_pipeline *pipeline,
+ bool link_optimize,
+ bool retain_shaders,
+ struct anv_pipeline_stage *stages,
+ struct anv_graphics_lib_pipeline *lib)
+{
+ struct anv_pipeline_sets_layout *lib_layout =
+ &lib->base.base.layout;
+ anv_graphics_pipeline_import_layout(pipeline, lib_layout);
- if (color_write_info) {
- dynamic->color_writes = 0;
- for (uint32_t i = 0; i < color_write_info->attachmentCount; i++) {
- dynamic->color_writes |=
- color_write_info->pColorWriteEnables[i] ? (1u << i) : 0;
- }
- }
+ /* We can't have shaders specified twice through libraries. */
+ assert((pipeline->base.active_stages & lib->base.base.active_stages) == 0);
+
+ /* VK_EXT_graphics_pipeline_library:
+ *
+ * "To perform link time optimizations,
+ * VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT must
+ * be specified on all pipeline libraries that are being linked
+ * together. Implementations should retain any additional information
+ * needed to perform optimizations at the final link step when this bit
+ * is present."
+ */
+ assert(!link_optimize || lib->retain_shaders);
+
+ pipeline->base.active_stages |= lib->base.base.active_stages;
+
+ /* Propagate the fragment dynamic flag, unless we're doing link
+ * optimization, in which case we'll have all the state information and
+ * this will never be dynamic.
+ */
+ if (!link_optimize) {
+ if (lib->base.fragment_dynamic) {
+ assert(lib->base.base.active_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
+ pipeline->fragment_dynamic = true;
}
}
- const VkPipelineFragmentShadingRateStateCreateInfoKHR *fsr_state =
- vk_find_struct_const(pCreateInfo->pNext,
- PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR);
- if (fsr_state) {
- if (states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE)
- dynamic->fragment_shading_rate = fsr_state->fragmentSize;
- }
+ uint32_t shader_count = anv_graphics_pipeline_imported_shader_count(stages);
+ for (uint32_t s = 0; s < ARRAY_SIZE(lib->base.shaders); s++) {
+ if (lib->base.shaders[s] == NULL)
+ continue;
- pipeline->dynamic_state_mask = states;
+ stages[s].stage = s;
+ stages[s].feedback_idx = shader_count + lib->base.feedback_index[s];
+ stages[s].robust_flags = lib->base.robust_flags[s];
- /* Mark states that can either be dynamic or fully baked into the pipeline.
- */
- pipeline->static_state_mask = states &
- (ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS |
- ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
- ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE |
- ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY);
-}
+ /* Always import the shader sha1; it will be used for cache lookup. */
+ memcpy(stages[s].shader_sha1, lib->retained_shaders[s].shader_sha1,
+ sizeof(stages[s].shader_sha1));
+ stages[s].source_hash = lib->base.source_hashes[s];
-static void
-anv_pipeline_validate_create_info(const VkGraphicsPipelineCreateInfo *info)
-{
-#ifdef DEBUG
- struct anv_render_pass *renderpass = NULL;
- struct anv_subpass *subpass = NULL;
+ stages[s].subgroup_size_type = lib->retained_shaders[s].subgroup_size_type;
+ stages[s].imported.nir = lib->retained_shaders[s].nir;
+ stages[s].imported.bin = lib->base.shaders[s];
+ }
- /* Assert that all required members of VkGraphicsPipelineCreateInfo are
- * present. See the Vulkan 1.0.28 spec, Section 9.2 Graphics Pipelines.
+ /* When not link optimizing, import the executables (shader descriptions
+ * for VK_KHR_pipeline_executable_properties). With link optimization there
+ * is a chance it'll produce different binaries, so we'll add the optimized
+ * version later.
*/
- assert(info->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
-
- renderpass = anv_render_pass_from_handle(info->renderPass);
- assert(renderpass);
-
- assert(info->subpass < renderpass->subpass_count);
- subpass = &renderpass->subpasses[info->subpass];
-
- assert(info->stageCount >= 1);
- assert(info->pVertexInputState);
- assert(info->pInputAssemblyState);
- assert(info->pRasterizationState);
- if (!info->pRasterizationState->rasterizerDiscardEnable) {
- assert(info->pViewportState);
- assert(info->pMultisampleState);
-
- if (subpass && subpass->depth_stencil_attachment)
- assert(info->pDepthStencilState);
-
- if (subpass && subpass->color_count > 0) {
- bool all_color_unused = true;
- for (int i = 0; i < subpass->color_count; i++) {
- if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
- all_color_unused = false;
- }
- /* pColorBlendState is ignored if the pipeline has rasterization
- * disabled or if the subpass of the render pass the pipeline is
- * created against does not use any color attachments.
- */
- assert(info->pColorBlendState || all_color_unused);
+ if (!link_optimize) {
+ util_dynarray_foreach(&lib->base.base.executables,
+ struct anv_pipeline_executable, exe) {
+ util_dynarray_append(&pipeline->base.executables,
+ struct anv_pipeline_executable, *exe);
}
}
+}
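For context, a sketch of the application-side VK_EXT_graphics_pipeline_library usage that drives this import path; the helper and its parameters are illustrative. Libraries are created with VK_PIPELINE_CREATE_LIBRARY_BIT_KHR (plus RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT if they may later be optimized), and the final pipeline links them through VkPipelineLibraryCreateInfoKHR, optionally with LINK_TIME_OPTIMIZATION_BIT_EXT.
#include <vulkan/vulkan.h>
/* Fill the pNext chain for linking previously created library pipelines;
 * `libs` would hold e.g. a pre-rasterization and a fragment-shader library.
 */
static void
example_fill_link_info(const VkPipeline *libs, uint32_t lib_count, bool optimize,
                       VkPipelineLibraryCreateInfoKHR *link,
                       VkGraphicsPipelineCreateInfo *info)
{
   *link = (VkPipelineLibraryCreateInfoKHR) {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
      .libraryCount = lib_count,
      .pLibraries = libs,
   };
   *info = (VkGraphicsPipelineCreateInfo) {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
      .pNext = link,
      /* Optimized linking requires the libraries to have been built with
       * RETAIN_LINK_TIME_OPTIMIZATION_INFO, which is what the assert in the
       * import code above checks.
       */
      .flags = optimize ? VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT : 0,
   };
}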
- for (uint32_t i = 0; i < info->stageCount; ++i) {
- switch (info->pStages[i].stage) {
- case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
- case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
- assert(info->pTessellationState);
- break;
- default:
- break;
+static void
+anv_graphics_lib_validate_shaders(struct anv_graphics_lib_pipeline *lib,
+ bool retained_shaders)
+{
+ for (uint32_t s = 0; s < ARRAY_SIZE(lib->retained_shaders); s++) {
+ if (anv_pipeline_base_has_stage(&lib->base, s)) {
+ assert(!retained_shaders || lib->retained_shaders[s].nir != NULL);
+ assert(lib->base.shaders[s] != NULL);
}
}
-#endif
}
-/**
- * Calculate the desired L3 partitioning based on the current state of the
- * pipeline. For now this simply returns the conservative defaults calculated
- * by get_default_l3_weights(), but we could probably do better by gathering
- * more statistics from the pipeline state (e.g. guess of expected URB usage
- * and bound surfaces), or by using feed-back from performance counters.
- */
-void
-anv_pipeline_setup_l3_config(struct anv_pipeline *pipeline, bool needs_slm)
+static VkResult
+anv_graphics_lib_pipeline_create(struct anv_device *device,
+ struct vk_pipeline_cache *cache,
+ const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkPipeline *pPipeline)
{
- const struct intel_device_info *devinfo = &pipeline->device->info;
+ struct anv_pipeline_stage stages[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {};
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
+ };
+ int64_t pipeline_start = os_time_get_nano();
- const struct intel_l3_weights w =
- intel_get_default_l3_weights(devinfo, true, needs_slm);
+ struct anv_graphics_lib_pipeline *pipeline;
+ VkResult result;
- pipeline->l3_config = intel_get_l3_config(devinfo, w);
-}
+ const VkPipelineCreateFlags2KHR flags =
+ vk_graphics_pipeline_create_flags(pCreateInfo);
+ assert(flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR);
-static VkLineRasterizationModeEXT
-vk_line_rasterization_mode(const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
- const VkPipelineMultisampleStateCreateInfo *ms_info)
-{
- VkLineRasterizationModeEXT line_mode =
- line_info ? line_info->lineRasterizationMode :
- VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;
+ const VkPipelineLibraryCreateInfoKHR *libs_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ PIPELINE_LIBRARY_CREATE_INFO_KHR);
- if (line_mode == VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT) {
- if (ms_info && ms_info->rasterizationSamples > 1) {
- return VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT;
- } else {
- return VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT;
+ pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pipeline == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ result = anv_pipeline_init(&pipeline->base.base, device,
+ ANV_PIPELINE_GRAPHICS_LIB, flags,
+ pAllocator);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ if (result == VK_PIPELINE_COMPILE_REQUIRED)
+ *pPipeline = VK_NULL_HANDLE;
+ return result;
+ }
+
+ /* Capture the retain state before we compile/load any shader. */
+ pipeline->retain_shaders =
+ (flags & VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT) != 0;
+
+ /* If we have libraries, import them first. */
+ if (libs_info) {
+ for (uint32_t i = 0; i < libs_info->libraryCount; i++) {
+ ANV_FROM_HANDLE(anv_pipeline, pipeline_lib, libs_info->pLibraries[i]);
+ struct anv_graphics_lib_pipeline *gfx_pipeline_lib =
+ anv_pipeline_to_graphics_lib(pipeline_lib);
+
+ vk_graphics_pipeline_state_merge(&pipeline->state, &gfx_pipeline_lib->state);
+ anv_graphics_pipeline_import_lib(&pipeline->base,
+ false /* link_optimize */,
+ pipeline->retain_shaders,
+ stages, gfx_pipeline_lib);
}
}
- return line_mode;
+ result = vk_graphics_pipeline_state_fill(&device->vk,
+ &pipeline->state, pCreateInfo,
+ NULL /* driver_rp */,
+ 0 /* driver_rp_flags */,
+ &pipeline->all_state, NULL, 0, NULL);
+ if (result != VK_SUCCESS) {
+ anv_pipeline_finish(&pipeline->base.base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
+
+ pipeline->base.base.active_stages = pipeline->state.shader_stages;
+
+ /* After we've imported all the libraries' layouts, import the pipeline
+ * layout and hash the whole lot.
+ */
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+ if (pipeline_layout != NULL) {
+ anv_graphics_pipeline_import_layout(&pipeline->base,
+ &pipeline_layout->sets_layout);
+ }
+
+ anv_pipeline_sets_layout_hash(&pipeline->base.base.layout);
+
+ /* Compile shaders. We can skip this if there are no active stages in
+ * this pipeline.
+ */
+ if (pipeline->base.base.active_stages != 0) {
+ result = anv_graphics_pipeline_compile(&pipeline->base, stages,
+ cache, &pipeline_feedback,
+ pCreateInfo, &pipeline->state);
+ if (result != VK_SUCCESS) {
+ anv_pipeline_finish(&pipeline->base.base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
+ }
+
+ pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
+
+ anv_fill_pipeline_creation_feedback(&pipeline->base, &pipeline_feedback,
+ pCreateInfo, stages);
+
+ anv_graphics_lib_validate_shaders(
+ pipeline,
+ flags & VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT);
+
+ *pPipeline = anv_pipeline_to_handle(&pipeline->base.base);
+
+ return VK_SUCCESS;
}
-VkResult
-anv_graphics_pipeline_init(struct anv_graphics_pipeline *pipeline,
- struct anv_device *device,
- struct anv_pipeline_cache *cache,
- const VkGraphicsPipelineCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *alloc)
+static VkResult
+anv_graphics_pipeline_create(struct anv_device *device,
+ struct vk_pipeline_cache *cache,
+ const VkGraphicsPipelineCreateInfo *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkPipeline *pPipeline)
{
+ struct anv_pipeline_stage stages[ANV_GRAPHICS_SHADER_STAGE_COUNT] = {};
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
+ };
+ int64_t pipeline_start = os_time_get_nano();
+
+ struct anv_graphics_pipeline *pipeline;
VkResult result;
- anv_pipeline_validate_create_info(pCreateInfo);
+ const VkPipelineCreateFlags2KHR flags =
+ vk_graphics_pipeline_create_flags(pCreateInfo);
+ assert((flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) == 0);
- result = anv_pipeline_init(&pipeline->base, device,
- ANV_PIPELINE_GRAPHICS, pCreateInfo->flags,
- alloc);
- if (result != VK_SUCCESS)
+ const VkPipelineLibraryCreateInfoKHR *libs_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ PIPELINE_LIBRARY_CREATE_INFO_KHR);
+
+ pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (pipeline == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ /* Initialize some information required by shaders */
+ result = anv_pipeline_init(&pipeline->base.base, device,
+ ANV_PIPELINE_GRAPHICS, flags,
+ pAllocator);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
return result;
+ }
- anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
- pipeline->batch_data, sizeof(pipeline->batch_data));
+ const bool link_optimize =
+ (flags & VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) != 0;
- ANV_FROM_HANDLE(anv_render_pass, render_pass, pCreateInfo->renderPass);
- assert(pCreateInfo->subpass < render_pass->subpass_count);
- pipeline->subpass = &render_pass->subpasses[pCreateInfo->subpass];
+ struct vk_graphics_pipeline_all_state all;
+ struct vk_graphics_pipeline_state state = { };
- assert(pCreateInfo->pRasterizationState);
+ /* If we have libraries, import them first. */
+ if (libs_info) {
+ for (uint32_t i = 0; i < libs_info->libraryCount; i++) {
+ ANV_FROM_HANDLE(anv_pipeline, pipeline_lib, libs_info->pLibraries[i]);
+ struct anv_graphics_lib_pipeline *gfx_pipeline_lib =
+ anv_pipeline_to_graphics_lib(pipeline_lib);
- if (pCreateInfo->pDynamicState) {
- /* Remove all of the states that are marked as dynamic */
- uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
- for (uint32_t s = 0; s < count; s++) {
- pipeline->dynamic_states |= anv_cmd_dirty_bit_for_vk_dynamic_state(
- pCreateInfo->pDynamicState->pDynamicStates[s]);
+ /* If we have link time optimization, all libraries must be created
+ * with
+ * VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT.
+ */
+ assert(!link_optimize || gfx_pipeline_lib->retain_shaders);
+
+ vk_graphics_pipeline_state_merge(&state, &gfx_pipeline_lib->state);
+ anv_graphics_pipeline_import_lib(&pipeline->base,
+ link_optimize,
+ false,
+ stages,
+ gfx_pipeline_lib);
}
}
- copy_non_dynamic_state(pipeline, pCreateInfo);
- pipeline->depth_clamp_enable = pCreateInfo->pRasterizationState->depthClampEnable;
+ result = vk_graphics_pipeline_state_fill(&device->vk, &state, pCreateInfo,
+ NULL /* driver_rp */,
+ 0 /* driver_rp_flags */,
+ &all, NULL, 0, NULL);
+ if (result != VK_SUCCESS) {
+ anv_pipeline_finish(&pipeline->base.base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
+
+ pipeline->dynamic_state.vi = &pipeline->vertex_input;
+ pipeline->dynamic_state.ms.sample_locations = &pipeline->base.sample_locations;
+ vk_dynamic_graphics_state_fill(&pipeline->dynamic_state, &state);
+
+ pipeline->base.base.active_stages = state.shader_stages;
+
+ /* Sanity check on the shaders */
+ assert(pipeline->base.base.active_stages & VK_SHADER_STAGE_VERTEX_BIT ||
+ pipeline->base.base.active_stages & VK_SHADER_STAGE_MESH_BIT_EXT);
- /* Previously we enabled depth clipping when !depthClampEnable.
- * DepthClipStateCreateInfo now makes depth clipping explicit so if the
- * clipping info is available, use its enable value to determine clipping,
- * otherwise fallback to the previous !depthClampEnable logic.
+ if (anv_pipeline_is_mesh(pipeline)) {
+ assert(device->physical->vk.supported_extensions.EXT_mesh_shader);
+ }
+
+ /* After we've imported all the libraries' layouts, import the pipeline
+ * layout and hash the whole lot.
*/
- const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
- vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
- PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
- pipeline->depth_clip_enable = clip_info ? clip_info->depthClipEnable : !pipeline->depth_clamp_enable;
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+ if (pipeline_layout != NULL) {
+ anv_graphics_pipeline_import_layout(&pipeline->base,
+ &pipeline_layout->sets_layout);
+ }
- pipeline->sample_shading_enable =
- !pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
- pCreateInfo->pMultisampleState &&
- pCreateInfo->pMultisampleState->sampleShadingEnable;
+ anv_pipeline_sets_layout_hash(&pipeline->base.base.layout);
- result = anv_pipeline_compile_graphics(pipeline, cache, pCreateInfo);
+ /* Compile shaders. All required information should have been copied in
+ * the previous step. We can skip this if there are no active stages in
+ * the pipeline.
+ */
+ result = anv_graphics_pipeline_compile(&pipeline->base, stages,
+ cache, &pipeline_feedback,
+ pCreateInfo, &state);
if (result != VK_SUCCESS) {
- anv_pipeline_finish(&pipeline->base, device, alloc);
+ anv_pipeline_finish(&pipeline->base.base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
return result;
}
- assert(pipeline->shaders[MESA_SHADER_VERTEX]);
-
- anv_pipeline_setup_l3_config(&pipeline->base, false);
-
- const VkPipelineVertexInputStateCreateInfo *vi_info =
- pCreateInfo->pVertexInputState;
+ /* Prepare a batch for the commands and emit all the non-dynamic ones.
+ */
+ anv_batch_set_storage(&pipeline->base.base.batch, ANV_NULL_ADDRESS,
+ pipeline->batch_data, sizeof(pipeline->batch_data));
- const uint64_t inputs_read = get_vs_prog_data(pipeline)->inputs_read;
+ if (pipeline->base.base.active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
+ pipeline->base.base.active_stages |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
- for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
- const VkVertexInputAttributeDescription *desc =
- &vi_info->pVertexAttributeDescriptions[i];
+ if (anv_pipeline_is_mesh(pipeline))
+ assert(device->physical->vk.supported_extensions.EXT_mesh_shader);
- if (inputs_read & (1ull << (VERT_ATTRIB_GENERIC0 + desc->location)))
- pipeline->vb_used |= 1 << desc->binding;
- }
+ anv_graphics_pipeline_emit(pipeline, &state);
- for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
- const VkVertexInputBindingDescription *desc =
- &vi_info->pVertexBindingDescriptions[i];
+ pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
- pipeline->vb[desc->binding].stride = desc->stride;
+ anv_fill_pipeline_creation_feedback(&pipeline->base, &pipeline_feedback,
+ pCreateInfo, stages);
- /* Step rate is programmed per vertex element (attribute), not
- * binding. Set up a map of which bindings step per instance, for
- * reference by vertex element setup. */
- switch (desc->inputRate) {
- default:
- case VK_VERTEX_INPUT_RATE_VERTEX:
- pipeline->vb[desc->binding].instanced = false;
- break;
- case VK_VERTEX_INPUT_RATE_INSTANCE:
- pipeline->vb[desc->binding].instanced = true;
- break;
- }
+ ANV_RMV(graphics_pipeline_create, device, pipeline, false);
- pipeline->vb[desc->binding].instance_divisor = 1;
- }
+ *pPipeline = anv_pipeline_to_handle(&pipeline->base.base);
- const VkPipelineVertexInputDivisorStateCreateInfoEXT *vi_div_state =
- vk_find_struct_const(vi_info->pNext,
- PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
- if (vi_div_state) {
- for (uint32_t i = 0; i < vi_div_state->vertexBindingDivisorCount; i++) {
- const VkVertexInputBindingDivisorDescriptionEXT *desc =
- &vi_div_state->pVertexBindingDivisors[i];
+ return pipeline->base.base.batch.status;
+}
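/* Illustrative application-side sketch (not part of this patch): linking
 * previously created graphics pipeline library parts into a complete
 * pipeline, the path handled by anv_graphics_pipeline_create() above.
 * Assumes <vulkan/vulkan.h>; the handles passed in are placeholders
 * supplied by the caller.
 */
static VkResult
example_link_graphics_pipeline_libraries(VkDevice device,
                                         VkPipelineCache cache,
                                         VkPipelineLayout layout,
                                         uint32_t library_count,
                                         const VkPipeline *libraries,
                                         VkPipeline *out_pipeline)
{
   /* Chain the libraries into the create info; the driver merges their
    * state and shaders, optionally re-optimizing at link time.
    */
   const VkPipelineLibraryCreateInfoKHR lib_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
      .libraryCount = library_count,
      .pLibraries = libraries,
   };
   const VkGraphicsPipelineCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
      .pNext = &lib_info,
      /* Requires the libraries to have been built with
       * VK_PIPELINE_CREATE_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT.
       */
      .flags = VK_PIPELINE_CREATE_LINK_TIME_OPTIMIZATION_BIT_EXT,
      .layout = layout,
   };
   return vkCreateGraphicsPipelines(device, cache, 1, &info, NULL,
                                    out_pipeline);
}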
- pipeline->vb[desc->binding].instance_divisor = desc->divisor;
+VkResult anv_CreateGraphicsPipelines(
+ VkDevice _device,
+ VkPipelineCache pipelineCache,
+ uint32_t count,
+ const VkGraphicsPipelineCreateInfo* pCreateInfos,
+ const VkAllocationCallbacks* pAllocator,
+ VkPipeline* pPipelines)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache);
+
+ VkResult result = VK_SUCCESS;
+
+ unsigned i;
+ for (i = 0; i < count; i++) {
+ assert(pCreateInfos[i].sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
+
+ const VkPipelineCreateFlags2KHR flags =
+ vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
+ VkResult res;
+ if (flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
+ res = anv_graphics_lib_pipeline_create(device, pipeline_cache,
+ &pCreateInfos[i],
+ pAllocator,
+ &pPipelines[i]);
+ } else {
+ res = anv_graphics_pipeline_create(device,
+ pipeline_cache,
+ &pCreateInfos[i],
+ pAllocator, &pPipelines[i]);
}
- }
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. If the client asks for instancing, we need to multiply
- * the instance divisor by the number of views ensure that we repeat the
- * client's per-instance data once for each view.
- */
- if (pipeline->subpass->view_mask && !pipeline->use_primitive_replication) {
- const uint32_t view_count = anv_subpass_view_count(pipeline->subpass);
- for (uint32_t vb = 0; vb < MAX_VBS; vb++) {
- if (pipeline->vb[vb].instanced)
- pipeline->vb[vb].instance_divisor *= view_count;
- }
- }
+ if (res == VK_SUCCESS)
+ continue;
- const VkPipelineInputAssemblyStateCreateInfo *ia_info =
- pCreateInfo->pInputAssemblyState;
- const VkPipelineTessellationStateCreateInfo *tess_info =
- pCreateInfo->pTessellationState;
+ /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED as it
+ * is not obvious what error should be reported upon 2 different failures.
+ */
+ result = res;
+ if (res != VK_PIPELINE_COMPILE_REQUIRED)
+ break;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
- pipeline->topology = _3DPRIM_PATCHLIST(tess_info->patchControlPoints);
- else
- pipeline->topology = vk_to_intel_primitive_type[ia_info->topology];
+ pPipelines[i] = VK_NULL_HANDLE;
- /* If rasterization is not enabled, ms_info must be ignored. */
- const bool raster_enabled =
- !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
- (pipeline->dynamic_states &
- ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
+ if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
+ break;
+ }
- const VkPipelineMultisampleStateCreateInfo *ms_info =
- raster_enabled ? pCreateInfo->pMultisampleState : NULL;
+ for (; i < count; i++)
+ pPipelines[i] = VK_NULL_HANDLE;
- const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
- vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
- PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
+ return result;
+}
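/* Illustrative application-side sketch (not part of this patch): driving the
 * vkCreateGraphicsPipelines() entry point above with
 * VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT, so a cold cache
 * shows up as VK_PIPELINE_COMPILE_REQUIRED and the real compile can be
 * retried or moved off the critical path. Assumes <vulkan/vulkan.h>;
 * create_info is caller-provided.
 */
static VkResult
example_create_pipeline_cached_only(VkDevice device,
                                    VkPipelineCache cache,
                                    VkGraphicsPipelineCreateInfo create_info,
                                    VkPipeline *out_pipeline)
{
   VkPipelineCreationFeedback feedback = { 0 };
   const VkPipelineCreationFeedbackCreateInfo feedback_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO,
      .pNext = create_info.pNext,
      .pPipelineCreationFeedback = &feedback,
   };
   create_info.pNext = &feedback_info;
   create_info.flags |= VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT;

   VkResult result = vkCreateGraphicsPipelines(device, cache, 1, &create_info,
                                               NULL, out_pipeline);
   if (result == VK_PIPELINE_COMPILE_REQUIRED) {
      /* Nothing usable in the cache; retry without the flag (for example on
       * a worker thread) to actually compile the shaders.
       */
      create_info.flags &= ~VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT;
      result = vkCreateGraphicsPipelines(device, cache, 1, &create_info,
                                         NULL, out_pipeline);
   }
   return result;
}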
- /* Store line mode, polygon mode and rasterization samples, these are used
- * for dynamic primitive topology.
- */
- pipeline->line_mode = vk_line_rasterization_mode(line_info, ms_info);
- pipeline->polygon_mode = pCreateInfo->pRasterizationState->polygonMode;
- pipeline->rasterization_samples =
- ms_info ? ms_info->rasterizationSamples : 1;
+static bool
+should_remat_cb(nir_instr *instr, void *data)
+{
+ if (instr->type != nir_instr_type_intrinsic)
+ return false;
- return VK_SUCCESS;
+ return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_resource_intel;
}
static VkResult
compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
nir_shader *nir,
struct anv_pipeline_stage *stage,
- struct anv_shader_bin **shader_out,
void *mem_ctx)
{
const struct brw_compiler *compiler =
@@ -2491,48 +3349,64 @@ compile_upload_rt_shader(struct anv_ray_tracing_pipeline *pipeline,
nir_shader **resume_shaders = NULL;
uint32_t num_resume_shaders = 0;
if (nir->info.stage != MESA_SHADER_COMPUTE) {
- NIR_PASS_V(nir, nir_lower_shader_calls,
- nir_address_format_64bit_global,
- BRW_BTD_STACK_ALIGN,
- &resume_shaders, &num_resume_shaders, mem_ctx);
- NIR_PASS_V(nir, brw_nir_lower_shader_calls);
+ const nir_lower_shader_calls_options opts = {
+ .address_format = nir_address_format_64bit_global,
+ .stack_alignment = BRW_BTD_STACK_ALIGN,
+ .localized_loads = true,
+ .vectorizer_callback = brw_nir_should_vectorize_mem,
+ .vectorizer_data = NULL,
+ .should_remat_callback = should_remat_cb,
+ };
+
+ NIR_PASS(_, nir, nir_lower_shader_calls, &opts,
+ &resume_shaders, &num_resume_shaders, mem_ctx);
+ NIR_PASS(_, nir, brw_nir_lower_shader_calls, &stage->key.bs);
NIR_PASS_V(nir, brw_nir_lower_rt_intrinsics, devinfo);
}
for (unsigned i = 0; i < num_resume_shaders; i++) {
- NIR_PASS_V(resume_shaders[i], brw_nir_lower_shader_calls);
+ NIR_PASS(_, resume_shaders[i], brw_nir_lower_shader_calls, &stage->key.bs);
NIR_PASS_V(resume_shaders[i], brw_nir_lower_rt_intrinsics, devinfo);
}
- stage->code =
- brw_compile_bs(compiler, pipeline->base.device, mem_ctx,
- &stage->key.bs, &stage->prog_data.bs, nir,
- num_resume_shaders, resume_shaders, stage->stats, NULL);
- if (stage->code == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- /* Ray-tracing shaders don't have a "real" bind map */
- struct anv_pipeline_bind_map empty_bind_map = {};
+ struct brw_compile_bs_params params = {
+ .base = {
+ .nir = nir,
+ .stats = stage->stats,
+ .log_data = pipeline->base.device,
+ .mem_ctx = mem_ctx,
+ },
+ .key = &stage->key.bs,
+ .prog_data = &stage->prog_data.bs,
+ .num_resume_shaders = num_resume_shaders,
+ .resume_shaders = resume_shaders,
+ };
- const unsigned code_size = stage->prog_data.base.program_size;
- struct anv_shader_bin *bin =
- anv_device_upload_kernel(pipeline->base.device,
- cache,
- stage->stage,
- &stage->cache_key, sizeof(stage->cache_key),
- stage->code, code_size,
- &stage->prog_data.base,
- sizeof(stage->prog_data.bs),
- stage->stats, 1,
- NULL, &empty_bind_map);
- if (bin == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ stage->code = brw_compile_bs(compiler, &params);
+ if (stage->code == NULL)
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = stage->stage,
+ .key_data = &stage->cache_key,
+ .key_size = sizeof(stage->cache_key),
+ .kernel_data = stage->code,
+ .kernel_size = stage->prog_data.base.program_size,
+ .prog_data = &stage->prog_data.base,
+ .prog_data_size = brw_prog_data_size(stage->stage),
+ .stats = stage->stats,
+ .num_stats = 1,
+ .bind_map = &stage->bind_map,
+ .push_desc_info = &stage->push_desc_info,
+ .dynamic_push_values = stage->dynamic_push_values,
+ };
- /* TODO: Figure out executables for resume shaders */
- anv_pipeline_add_executables(&pipeline->base, stage, bin);
- util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, bin);
+ stage->bin =
+ anv_device_upload_kernel(pipeline->base.device, cache, &upload_params);
+ if (stage->bin == NULL)
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
- *shader_out = bin;
+ anv_pipeline_add_executables(&pipeline->base, stage);
return VK_SUCCESS;
}
@@ -2595,51 +3469,72 @@ anv_pipeline_compute_ray_tracing_stacks(struct anv_ray_tracing_pipeline *pipelin
}
}
+static enum brw_rt_ray_flags
+anv_pipeline_get_pipeline_ray_flags(VkPipelineCreateFlags2KHR flags)
+{
+ uint32_t ray_flags = 0;
+
+ const bool rt_skip_triangles =
+ flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR;
+ const bool rt_skip_aabbs =
+ flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_AABBS_BIT_KHR;
+ assert(!(rt_skip_triangles && rt_skip_aabbs));
+
+ if (rt_skip_triangles)
+ ray_flags |= BRW_RT_RAY_FLAG_SKIP_TRIANGLES;
+ else if (rt_skip_aabbs)
+ ray_flags |= BRW_RT_RAY_FLAG_SKIP_AABBS;
+
+ return ray_flags;
+}
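/* Illustrative application-side sketch (not part of this patch): requesting
 * the skip-triangles behaviour that anv_pipeline_get_pipeline_ray_flags()
 * translates into BRW_RT_RAY_FLAG_SKIP_TRIANGLES, using the
 * VK_KHR_maintenance5 flags2 mechanism. Assumes <vulkan/vulkan.h>; rt_info
 * and flags2 are caller-provided storage.
 */
static void
example_request_skip_triangles(VkRayTracingPipelineCreateInfoKHR *rt_info,
                               VkPipelineCreateFlags2CreateInfoKHR *flags2)
{
   *flags2 = (VkPipelineCreateFlags2CreateInfoKHR) {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATE_FLAGS_2_CREATE_INFO_KHR,
      .pNext = rt_info->pNext,
      /* Only one of SKIP_TRIANGLES / SKIP_AABBS may be set, matching the
       * assert in anv_pipeline_get_pipeline_ray_flags().
       */
      .flags = VK_PIPELINE_CREATE_2_RAY_TRACING_SKIP_TRIANGLES_BIT_KHR,
   };
   rt_info->pNext = flags2;
   rt_info->flags = 0; /* superseded by the flags2 struct when present */
}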
+
static struct anv_pipeline_stage *
anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline,
const VkRayTracingPipelineCreateInfoKHR *info,
- void *pipeline_ctx)
+ void *tmp_pipeline_ctx)
{
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
-
+ struct anv_device *device = pipeline->base.device;
/* Create enough stage entries for all shader modules plus potential
* combinations in the groups.
*/
struct anv_pipeline_stage *stages =
- rzalloc_array(pipeline_ctx, struct anv_pipeline_stage, info->stageCount);
+ rzalloc_array(tmp_pipeline_ctx, struct anv_pipeline_stage, info->stageCount);
+
+ enum brw_rt_ray_flags ray_flags =
+ anv_pipeline_get_pipeline_ray_flags(pipeline->base.flags);
for (uint32_t i = 0; i < info->stageCount; i++) {
const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i];
- if (sinfo->module == VK_NULL_HANDLE)
+ if (vk_pipeline_shader_stage_is_null(sinfo))
continue;
int64_t stage_start = os_time_get_nano();
stages[i] = (struct anv_pipeline_stage) {
.stage = vk_to_mesa_shader_stage(sinfo->stage),
- .module = vk_shader_module_from_handle(sinfo->module),
- .entrypoint = sinfo->pName,
- .spec_info = sinfo->pSpecializationInfo,
+ .pipeline_pNext = info->pNext,
+ .info = sinfo,
.cache_key = {
.stage = vk_to_mesa_shader_stage(sinfo->stage),
},
.feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
},
};
- populate_bs_prog_key(&pipeline->base.device->info, sinfo->flags,
- pipeline->base.device->robust_buffer_access,
- &stages[i].key.bs);
+ anv_stage_allocate_bind_map_tables(&pipeline->base, &stages[i],
+ tmp_pipeline_ctx);
+
+ pipeline->base.active_stages |= sinfo->stage;
- anv_pipeline_hash_shader(stages[i].module,
- stages[i].entrypoint,
- stages[i].stage,
- stages[i].spec_info,
- stages[i].shader_sha1);
+ anv_stage_write_shader_hash(&stages[i], device);
+
+ populate_bs_prog_key(&stages[i],
+ pipeline->base.device,
+ ray_flags);
if (stages[i].stage != MESA_SHADER_INTERSECTION) {
- anv_pipeline_hash_ray_tracing_shader(pipeline, layout, &stages[i],
+ anv_pipeline_hash_ray_tracing_shader(pipeline, &stages[i],
stages[i].cache_key.sha1);
}
@@ -2661,12 +3556,11 @@ anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline,
if (any_hit_idx != VK_SHADER_UNUSED_KHR) {
assert(any_hit_idx < info->stageCount);
anv_pipeline_hash_ray_tracing_combined_shader(pipeline,
- layout,
&stages[intersection_idx],
&stages[any_hit_idx],
stages[intersection_idx].cache_key.sha1);
} else {
- anv_pipeline_hash_ray_tracing_shader(pipeline, layout,
+ anv_pipeline_hash_ray_tracing_shader(pipeline,
&stages[intersection_idx],
stages[intersection_idx].cache_key.sha1);
}
@@ -2678,15 +3572,14 @@ anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline,
}
static bool
-anv_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
- const VkRayTracingPipelineCreateInfoKHR *info,
- struct anv_pipeline_stage *stages,
- uint32_t *stack_max)
+anv_ray_tracing_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline,
+ struct vk_pipeline_cache *cache,
+ const VkRayTracingPipelineCreateInfoKHR *info,
+ struct anv_pipeline_stage *stages)
{
uint32_t shaders = 0, cache_hits = 0;
for (uint32_t i = 0; i < info->stageCount; i++) {
- if (stages[i].entrypoint == NULL)
+ if (stages[i].info == NULL)
continue;
shaders++;
@@ -2701,18 +3594,11 @@ anv_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline,
if (cache_hit) {
cache_hits++;
stages[i].feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
}
- if (stages[i].bin != NULL) {
- anv_pipeline_add_executables(&pipeline->base, &stages[i], stages[i].bin);
- util_dynarray_append(&pipeline->shaders, struct anv_shader_bin *, stages[i].bin);
-
- uint32_t stack_size =
- brw_bs_prog_data_const(stages[i].bin->prog_data)->max_stack_size;
- stack_max[stages[i].stage] =
- MAX2(stack_max[stages[i].stage], stack_size);
- }
+ if (stages[i].bin != NULL)
+ anv_pipeline_add_executables(&pipeline->base, &stages[i]);
stages[i].feedback.duration += os_time_get_nano() - stage_start;
}
@@ -2722,61 +3608,54 @@ anv_pipeline_load_cached_shaders(struct anv_ray_tracing_pipeline *pipeline,
static VkResult
anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
+ void *tmp_pipeline_ctx,
+ struct anv_pipeline_stage *stages,
+ struct vk_pipeline_cache *cache,
const VkRayTracingPipelineCreateInfoKHR *info)
{
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
+ const struct intel_device_info *devinfo = pipeline->base.device->info;
VkResult result;
- VkPipelineCreationFeedbackEXT pipeline_feedback = {
- .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT,
+ VkPipelineCreationFeedback pipeline_feedback = {
+ .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
};
int64_t pipeline_start = os_time_get_nano();
- void *pipeline_ctx = ralloc_context(NULL);
-
- struct anv_pipeline_stage *stages =
- anv_pipeline_init_ray_tracing_stages(pipeline, info, pipeline_ctx);
-
- ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
-
const bool skip_cache_lookup =
(pipeline->base.flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR);
- uint32_t stack_max[MESA_VULKAN_SHADER_STAGES] = {};
-
if (!skip_cache_lookup &&
- anv_pipeline_load_cached_shaders(pipeline, cache, info, stages, stack_max)) {
+ anv_ray_tracing_pipeline_load_cached_shaders(pipeline, cache, info, stages)) {
pipeline_feedback.flags |=
- VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT;
+ VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
goto done;
}
- if (info->flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) {
- ralloc_free(pipeline_ctx);
- return VK_PIPELINE_COMPILE_REQUIRED_EXT;
- }
+ if (pipeline->base.flags & VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR)
+ return VK_PIPELINE_COMPILE_REQUIRED;
for (uint32_t i = 0; i < info->stageCount; i++) {
- if (stages[i].entrypoint == NULL)
+ if (stages[i].info == NULL)
continue;
int64_t stage_start = os_time_get_nano();
stages[i].nir = anv_pipeline_stage_get_nir(&pipeline->base, cache,
- pipeline_ctx, &stages[i]);
- if (stages[i].nir == NULL) {
- ralloc_free(pipeline_ctx);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
+ tmp_pipeline_ctx, &stages[i]);
+ if (stages[i].nir == NULL)
+ return vk_error(pipeline, VK_ERROR_OUT_OF_HOST_MEMORY);
- anv_pipeline_lower_nir(&pipeline->base, pipeline_ctx, &stages[i], layout);
+ anv_pipeline_nir_preprocess(&pipeline->base, &stages[i]);
+
+ anv_pipeline_lower_nir(&pipeline->base, tmp_pipeline_ctx, &stages[i],
+ &pipeline->base.layout, 0 /* view_mask */,
+ false /* use_primitive_replication */);
stages[i].feedback.duration += os_time_get_nano() - stage_start;
}
for (uint32_t i = 0; i < info->stageCount; i++) {
- if (stages[i].entrypoint == NULL)
+ if (stages[i].info == NULL)
continue;
/* Shader found in cache already. */
@@ -2789,9 +3668,9 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
int64_t stage_start = os_time_get_nano();
- void *stage_ctx = ralloc_context(pipeline_ctx);
+ void *tmp_stage_ctx = ralloc_context(tmp_pipeline_ctx);
- nir_shader *nir = nir_shader_clone(stage_ctx, stages[i].nir);
+ nir_shader *nir = nir_shader_clone(tmp_stage_ctx, stages[i].nir);
switch (stages[i].stage) {
case MESA_SHADER_RAYGEN:
brw_nir_lower_raygen(nir);
@@ -2821,21 +3700,18 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
}
result = compile_upload_rt_shader(pipeline, cache, nir, &stages[i],
- &stages[i].bin, stage_ctx);
+ tmp_stage_ctx);
if (result != VK_SUCCESS) {
- ralloc_free(pipeline_ctx);
+ ralloc_free(tmp_stage_ctx);
return result;
}
- uint32_t stack_size =
- brw_bs_prog_data_const(stages[i].bin->prog_data)->max_stack_size;
- stack_max[stages[i].stage] = MAX2(stack_max[stages[i].stage], stack_size);
-
- ralloc_free(stage_ctx);
+ ralloc_free(tmp_stage_ctx);
stages[i].feedback.duration += os_time_get_nano() - stage_start;
}
+ done:
for (uint32_t i = 0; i < info->groupCount; i++) {
const VkRayTracingShaderGroupCreateInfoKHR *ginfo = &info->pGroups[i];
struct anv_rt_shader_group *group = &pipeline->groups[i];
@@ -2869,9 +3745,9 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
if (any_hit_idx < info->stageCount)
any_hit = stages[any_hit_idx].nir;
- void *group_ctx = ralloc_context(pipeline_ctx);
+ void *tmp_group_ctx = ralloc_context(tmp_pipeline_ctx);
nir_shader *intersection =
- nir_shader_clone(group_ctx, stages[intersection_idx].nir);
+ nir_shader_clone(tmp_group_ctx, stages[intersection_idx].nir);
brw_nir_lower_combined_intersection_any_hit(intersection, any_hit,
devinfo);
@@ -2879,20 +3755,13 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
result = compile_upload_rt_shader(pipeline, cache,
intersection,
&stages[intersection_idx],
- &group->intersection,
- group_ctx);
- ralloc_free(group_ctx);
+ tmp_group_ctx);
+ ralloc_free(tmp_group_ctx);
if (result != VK_SUCCESS)
return result;
- } else {
- group->intersection = stages[intersection_idx].bin;
}
- uint32_t stack_size =
- brw_bs_prog_data_const(group->intersection->prog_data)->max_stack_size;
- stack_max[MESA_SHADER_INTERSECTION] =
- MAX2(stack_max[MESA_SHADER_INTERSECTION], stack_size);
-
+ group->intersection = stages[intersection_idx].bin;
break;
}
@@ -2901,20 +3770,16 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
}
}
- done:
- ralloc_free(pipeline_ctx);
-
- anv_pipeline_compute_ray_tracing_stacks(pipeline, info, stack_max);
-
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
- const VkPipelineCreationFeedbackCreateInfoEXT *create_feedback =
- vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
+ const VkPipelineCreationFeedbackCreateInfo *create_feedback =
+ vk_find_struct_const(info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
if (create_feedback) {
*create_feedback->pPipelineCreationFeedback = pipeline_feedback;
- assert(info->stageCount == create_feedback->pipelineStageCreationFeedbackCount);
- for (uint32_t i = 0; i < info->stageCount; i++) {
+ uint32_t stage_count = create_feedback->pipelineStageCreationFeedbackCount;
+ assert(stage_count == 0 || info->stageCount == stage_count);
+ for (uint32_t i = 0; i < stage_count; i++) {
gl_shader_stage s = vk_to_mesa_shader_stage(info->pStages[i].stage);
create_feedback->pPipelineStageCreationFeedbacks[i] = stages[s].feedback;
}
@@ -2926,23 +3791,23 @@ anv_pipeline_compile_ray_tracing(struct anv_ray_tracing_pipeline *pipeline,
VkResult
anv_device_init_rt_shaders(struct anv_device *device)
{
+ device->bvh_build_method = ANV_BVH_BUILD_METHOD_NEW_SAH;
+
if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline)
return VK_SUCCESS;
bool cache_hit;
+ struct anv_push_descriptor_info empty_push_desc_info = {};
+ struct anv_pipeline_bind_map empty_bind_map = {};
struct brw_rt_trampoline {
char name[16];
struct brw_cs_prog_key key;
} trampoline_key = {
.name = "rt-trampoline",
- .key = {
- /* TODO: Other subgroup sizes? */
- .base.subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_8,
- },
};
device->rt_trampoline =
- anv_device_search_for_kernel(device, &device->default_pipeline_cache,
+ anv_device_search_for_kernel(device, device->internal_cache,
&trampoline_key, sizeof(trampoline_key),
&cache_hit);
if (device->rt_trampoline == NULL) {
@@ -2951,10 +3816,8 @@ anv_device_init_rt_shaders(struct anv_device *device)
nir_shader *trampoline_nir =
brw_nir_create_raygen_trampoline(device->physical->compiler, tmp_ctx);
- struct anv_pipeline_bind_map bind_map = {
- .surface_count = 0,
- .sampler_count = 0,
- };
+ trampoline_nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_16;
+
uint32_t dummy_params[4] = { 0, };
struct brw_cs_prog_data trampoline_prog_data = {
.base.nr_params = 4,
@@ -2963,30 +3826,44 @@ anv_device_init_rt_shaders(struct anv_device *device)
.uses_btd_stack_ids = true,
};
struct brw_compile_cs_params params = {
- .nir = trampoline_nir,
+ .base = {
+ .nir = trampoline_nir,
+ .log_data = device,
+ .mem_ctx = tmp_ctx,
+ },
.key = &trampoline_key.key,
.prog_data = &trampoline_prog_data,
- .log_data = device,
};
const unsigned *tramp_data =
- brw_compile_cs(device->physical->compiler, tmp_ctx, &params);
+ brw_compile_cs(device->physical->compiler, &params);
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = MESA_SHADER_COMPUTE,
+ .key_data = &trampoline_key,
+ .key_size = sizeof(trampoline_key),
+ .kernel_data = tramp_data,
+ .kernel_size = trampoline_prog_data.base.program_size,
+ .prog_data = &trampoline_prog_data.base,
+ .prog_data_size = sizeof(trampoline_prog_data),
+ .bind_map = &empty_bind_map,
+ .push_desc_info = &empty_push_desc_info,
+ };
device->rt_trampoline =
- anv_device_upload_kernel(device, &device->default_pipeline_cache,
- MESA_SHADER_COMPUTE,
- &trampoline_key, sizeof(trampoline_key),
- tramp_data,
- trampoline_prog_data.base.program_size,
- &trampoline_prog_data.base,
- sizeof(trampoline_prog_data),
- NULL, 0, NULL, &bind_map);
+ anv_device_upload_kernel(device, device->internal_cache,
+ &upload_params);
ralloc_free(tmp_ctx);
if (device->rt_trampoline == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
+ /* The cache already has a reference and it's not going anywhere so there
+ * is no need to hold a second reference.
+ */
+ anv_shader_bin_unref(device, device->rt_trampoline);
+
struct brw_rt_trivial_return {
char name[16];
struct brw_bs_prog_key key;
@@ -2994,7 +3871,7 @@ anv_device_init_rt_shaders(struct anv_device *device)
.name = "rt-trivial-ret",
};
device->rt_trivial_return =
- anv_device_search_for_kernel(device, &device->default_pipeline_cache,
+ anv_device_search_for_kernel(device, device->internal_cache,
&return_key, sizeof(return_key),
&cache_hit);
if (device->rt_trivial_return == NULL) {
@@ -3002,34 +3879,48 @@ anv_device_init_rt_shaders(struct anv_device *device)
nir_shader *trivial_return_nir =
brw_nir_create_trivial_return_shader(device->physical->compiler, tmp_ctx);
- NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, &device->info);
+ NIR_PASS_V(trivial_return_nir, brw_nir_lower_rt_intrinsics, device->info);
- struct anv_pipeline_bind_map bind_map = {
- .surface_count = 0,
- .sampler_count = 0,
- };
struct brw_bs_prog_data return_prog_data = { 0, };
+ struct brw_compile_bs_params params = {
+ .base = {
+ .nir = trivial_return_nir,
+ .log_data = device,
+ .mem_ctx = tmp_ctx,
+ },
+ .key = &return_key.key,
+ .prog_data = &return_prog_data,
+ };
const unsigned *return_data =
- brw_compile_bs(device->physical->compiler, device, tmp_ctx,
- &return_key.key, &return_prog_data, trivial_return_nir,
- 0, 0, NULL, NULL);
+ brw_compile_bs(device->physical->compiler, &params);
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = MESA_SHADER_CALLABLE,
+ .key_data = &return_key,
+ .key_size = sizeof(return_key),
+ .kernel_data = return_data,
+ .kernel_size = return_prog_data.base.program_size,
+ .prog_data = &return_prog_data.base,
+ .prog_data_size = sizeof(return_prog_data),
+ .bind_map = &empty_bind_map,
+ .push_desc_info = &empty_push_desc_info,
+ };
device->rt_trivial_return =
- anv_device_upload_kernel(device, &device->default_pipeline_cache,
- MESA_SHADER_CALLABLE,
- &return_key, sizeof(return_key),
- return_data, return_prog_data.base.program_size,
- &return_prog_data.base, sizeof(return_prog_data),
- NULL, 0, NULL, &bind_map);
+ anv_device_upload_kernel(device, device->internal_cache,
+ &upload_params);
ralloc_free(tmp_ctx);
- if (device->rt_trivial_return == NULL) {
- anv_shader_bin_unref(device, device->rt_trampoline);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
+ if (device->rt_trivial_return == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
+ /* The cache already has a reference and it's not going anywhere so there
+ * is no need to hold a second reference.
+ */
+ anv_shader_bin_unref(device, device->rt_trivial_return);
+
return VK_SUCCESS;
}
@@ -3038,34 +3929,247 @@ anv_device_finish_rt_shaders(struct anv_device *device)
{
if (!device->vk.enabled_extensions.KHR_ray_tracing_pipeline)
return;
-
- anv_shader_bin_unref(device, device->rt_trampoline);
}
-VkResult
+static void
anv_ray_tracing_pipeline_init(struct anv_ray_tracing_pipeline *pipeline,
struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
const VkAllocationCallbacks *alloc)
{
- VkResult result;
-
util_dynarray_init(&pipeline->shaders, pipeline->base.mem_ctx);
- result = anv_pipeline_compile_ray_tracing(pipeline, cache, pCreateInfo);
- if (result != VK_SUCCESS)
- goto fail;
+ ANV_FROM_HANDLE(anv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
+ anv_pipeline_init_layout(&pipeline->base, pipeline_layout);
anv_pipeline_setup_l3_config(&pipeline->base, /* needs_slm */ false);
+}
- return VK_SUCCESS;
+static void
+assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
+ uint32_t stage_idx,
+ VkShaderStageFlags valid_stages)
+{
+ if (stage_idx == VK_SHADER_UNUSED_KHR)
+ return;
-fail:
- util_dynarray_foreach(&pipeline->shaders,
- struct anv_shader_bin *, shader) {
- anv_shader_bin_unref(device, *shader);
+ assert(stage_idx < pCreateInfo->stageCount);
+ assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1);
+ assert(pCreateInfo->pStages[stage_idx].stage & valid_stages);
+}
+
+static VkResult
+anv_ray_tracing_pipeline_create(
+ VkDevice _device,
+ struct vk_pipeline_cache * cache,
+ const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkPipeline* pPipeline)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ VkResult result;
+
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR);
+
+ uint32_t group_count = pCreateInfo->groupCount;
+ if (pCreateInfo->pLibraryInfo) {
+ for (uint32_t l = 0; l < pCreateInfo->pLibraryInfo->libraryCount; l++) {
+ ANV_FROM_HANDLE(anv_pipeline, library,
+ pCreateInfo->pLibraryInfo->pLibraries[l]);
+ struct anv_ray_tracing_pipeline *rt_library =
+ anv_pipeline_to_ray_tracing(library);
+ group_count += rt_library->group_count;
+ }
+ }
+
+ VK_MULTIALLOC(ma);
+ VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1);
+ VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, group_count);
+ if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ result = anv_pipeline_init(&pipeline->base, device,
+ ANV_PIPELINE_RAY_TRACING,
+ vk_rt_pipeline_create_flags(pCreateInfo),
+ pAllocator);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
}
+
+ pipeline->group_count = group_count;
+ pipeline->groups = groups;
+
+ ASSERTED const VkShaderStageFlags ray_tracing_stages =
+ VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR;
+
+ for (uint32_t i = 0; i < pCreateInfo->stageCount; i++)
+ assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0);
+
+ for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) {
+ const VkRayTracingShaderGroupCreateInfoKHR *ginfo =
+ &pCreateInfo->pGroups[i];
+ assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader,
+ VK_SHADER_STAGE_RAYGEN_BIT_KHR |
+ VK_SHADER_STAGE_MISS_BIT_KHR |
+ VK_SHADER_STAGE_CALLABLE_BIT_KHR);
+ assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader,
+ VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR);
+ assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader,
+ VK_SHADER_STAGE_ANY_HIT_BIT_KHR);
+ assert_rt_stage_index_valid(pCreateInfo, ginfo->intersectionShader,
+ VK_SHADER_STAGE_INTERSECTION_BIT_KHR);
+ switch (ginfo->type) {
+ case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:
+ assert(ginfo->generalShader < pCreateInfo->stageCount);
+ assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR);
+ assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR);
+ assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
+ break;
+
+ case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:
+ assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
+ assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
+ break;
+
+ case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:
+ assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
+ break;
+
+ default:
+ unreachable("Invalid ray-tracing shader group type");
+ }
+ }
+
+ anv_ray_tracing_pipeline_init(pipeline, device, cache,
+ pCreateInfo, pAllocator);
+
+ void *tmp_ctx = ralloc_context(NULL);
+
+ struct anv_pipeline_stage *stages =
+ anv_pipeline_init_ray_tracing_stages(pipeline, pCreateInfo, tmp_ctx);
+
+ result = anv_pipeline_compile_ray_tracing(pipeline, tmp_ctx, stages,
+ cache, pCreateInfo);
+ if (result != VK_SUCCESS) {
+ ralloc_free(tmp_ctx);
+ util_dynarray_foreach(&pipeline->shaders, struct anv_shader_bin *, shader)
+ anv_shader_bin_unref(device, *shader);
+ anv_pipeline_finish(&pipeline->base, device);
+ vk_free2(&device->vk.alloc, pAllocator, pipeline);
+ return result;
+ }
+
+ /* Compute the size of the scratch BO (for register spilling) by taking the
+ * max of all the shaders in the pipeline. Also add the shaders to the list
+ * of executables.
+ */
+ uint32_t stack_max[MESA_VULKAN_SHADER_STAGES] = {};
+ for (uint32_t s = 0; s < pCreateInfo->stageCount; s++) {
+ util_dynarray_append(&pipeline->shaders,
+ struct anv_shader_bin *,
+ stages[s].bin);
+
+ uint32_t stack_size =
+ brw_bs_prog_data_const(stages[s].bin->prog_data)->max_stack_size;
+ stack_max[stages[s].stage] = MAX2(stack_max[stages[s].stage], stack_size);
+
+ anv_pipeline_account_shader(&pipeline->base, stages[s].bin);
+ }
+
+ anv_pipeline_compute_ray_tracing_stacks(pipeline, pCreateInfo, stack_max);
+
+ if (pCreateInfo->pLibraryInfo) {
+ uint32_t g = pCreateInfo->groupCount;
+ for (uint32_t l = 0; l < pCreateInfo->pLibraryInfo->libraryCount; l++) {
+ ANV_FROM_HANDLE(anv_pipeline, library,
+ pCreateInfo->pLibraryInfo->pLibraries[l]);
+ struct anv_ray_tracing_pipeline *rt_library =
+ anv_pipeline_to_ray_tracing(library);
+ for (uint32_t lg = 0; lg < rt_library->group_count; lg++) {
+ pipeline->groups[g] = rt_library->groups[lg];
+ pipeline->groups[g].imported = true;
+ g++;
+ }
+
+ /* Account for shaders in the library. */
+ util_dynarray_foreach(&rt_library->shaders,
+ struct anv_shader_bin *, shader) {
+ util_dynarray_append(&pipeline->shaders,
+ struct anv_shader_bin *,
+ anv_shader_bin_ref(*shader));
+ anv_pipeline_account_shader(&pipeline->base, *shader);
+ }
+
+ /* Add the library shaders to this pipeline's executables. */
+ util_dynarray_foreach(&rt_library->base.executables,
+ struct anv_pipeline_executable, exe) {
+ util_dynarray_append(&pipeline->base.executables,
+ struct anv_pipeline_executable, *exe);
+ }
+
+ pipeline->base.active_stages |= rt_library->base.active_stages;
+ }
+ }
+
+ anv_genX(device->info, ray_tracing_pipeline_emit)(pipeline);
+
+ ralloc_free(tmp_ctx);
+
+ ANV_RMV(rt_pipeline_create, device, pipeline, false);
+
+ *pPipeline = anv_pipeline_to_handle(&pipeline->base);
+
+ return pipeline->base.batch.status;
+}
+
+VkResult
+anv_CreateRayTracingPipelinesKHR(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ VkPipelineCache pipelineCache,
+ uint32_t createInfoCount,
+ const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
+ const VkAllocationCallbacks* pAllocator,
+ VkPipeline* pPipelines)
+{
+ ANV_FROM_HANDLE(vk_pipeline_cache, pipeline_cache, pipelineCache);
+
+ VkResult result = VK_SUCCESS;
+
+ unsigned i;
+ for (i = 0; i < createInfoCount; i++) {
+ const VkPipelineCreateFlags2KHR flags =
+ vk_rt_pipeline_create_flags(&pCreateInfos[i]);
+ VkResult res = anv_ray_tracing_pipeline_create(_device, pipeline_cache,
+ &pCreateInfos[i],
+ pAllocator, &pPipelines[i]);
+
+ if (res == VK_SUCCESS)
+ continue;
+
+ /* Bail out on the first error as it is not obvious what error should be
+ * reported upon 2 different failures. */
+ result = res;
+ if (result != VK_PIPELINE_COMPILE_REQUIRED)
+ break;
+
+ pPipelines[i] = VK_NULL_HANDLE;
+
+ if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
+ break;
+ }
+
+ for (; i < createInfoCount; i++)
+ pPipelines[i] = VK_NULL_HANDLE;
+
return result;
}
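/* Illustrative application-side sketch (not part of this patch): the minimal
 * vkCreateRayTracingPipelinesKHR() call serviced by the entry point above,
 * with a single raygen stage and one general shader group. Assumes
 * <vulkan/vulkan.h>; the module, layout and cache handles are placeholders.
 */
static VkResult
example_create_raygen_only_pipeline(VkDevice device,
                                    VkPipelineCache cache,
                                    VkShaderModule raygen_module,
                                    VkPipelineLayout layout,
                                    VkPipeline *out_pipeline)
{
   const VkPipelineShaderStageCreateInfo stage = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
      .stage = VK_SHADER_STAGE_RAYGEN_BIT_KHR,
      .module = raygen_module,
      .pName = "main",
   };
   const VkRayTracingShaderGroupCreateInfoKHR group = {
      .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR,
      .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR,
      .generalShader = 0,
      .closestHitShader = VK_SHADER_UNUSED_KHR,
      .anyHitShader = VK_SHADER_UNUSED_KHR,
      .intersectionShader = VK_SHADER_UNUSED_KHR,
   };
   const VkRayTracingPipelineCreateInfoKHR info = {
      .sType = VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR,
      .stageCount = 1,
      .pStages = &stage,
      .groupCount = 1,
      .pGroups = &group,
      .maxPipelineRayRecursionDepth = 1,
      .layout = layout,
   };
   return vkCreateRayTracingPipelinesKHR(device,
                                         VK_NULL_HANDLE /* deferred op */,
                                         cache, 1, &info, NULL, out_pipeline);
}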
@@ -3082,19 +4186,26 @@ VkResult anv_GetPipelineExecutablePropertiesKHR(
VkPipelineExecutablePropertiesKHR* pProperties)
{
ANV_FROM_HANDLE(anv_pipeline, pipeline, pPipelineInfo->pipeline);
- VK_OUTARRAY_MAKE(out, pProperties, pExecutableCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
+ pProperties, pExecutableCount);
util_dynarray_foreach (&pipeline->executables, struct anv_pipeline_executable, exe) {
- vk_outarray_append(&out, props) {
+ vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
gl_shader_stage stage = exe->stage;
props->stages = mesa_to_vk_shader_stage(stage);
unsigned simd_width = exe->stats.dispatch_width;
if (stage == MESA_SHADER_FRAGMENT) {
- WRITE_STR(props->name, "%s%d %s",
- simd_width ? "SIMD" : "vec",
- simd_width ? simd_width : 4,
- _mesa_shader_stage_to_string(stage));
+ if (exe->stats.max_polygons > 1)
+ WRITE_STR(props->name, "SIMD%dx%d %s",
+ exe->stats.max_polygons,
+ simd_width / exe->stats.max_polygons,
+ _mesa_shader_stage_to_string(stage));
+ else
+ WRITE_STR(props->name, "%s%d %s",
+ simd_width ? "SIMD" : "vec",
+ simd_width ? simd_width : 4,
+ _mesa_shader_stage_to_string(stage));
} else {
WRITE_STR(props->name, "%s", _mesa_shader_stage_to_string(stage));
}
@@ -3129,26 +4240,36 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
VkPipelineExecutableStatisticKHR* pStatistics)
{
ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline);
- VK_OUTARRAY_MAKE(out, pStatistics, pStatisticCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
+ pStatistics, pStatisticCount);
const struct anv_pipeline_executable *exe =
anv_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
const struct brw_stage_prog_data *prog_data;
switch (pipeline->type) {
- case ANV_PIPELINE_GRAPHICS: {
- prog_data = anv_pipeline_to_graphics(pipeline)->shaders[exe->stage]->prog_data;
+ case ANV_PIPELINE_GRAPHICS:
+ case ANV_PIPELINE_GRAPHICS_LIB: {
+ prog_data = anv_pipeline_to_graphics(pipeline)->base.shaders[exe->stage]->prog_data;
break;
}
case ANV_PIPELINE_COMPUTE: {
prog_data = anv_pipeline_to_compute(pipeline)->cs->prog_data;
break;
}
+ case ANV_PIPELINE_RAY_TRACING: {
+ struct anv_shader_bin **shader =
+ util_dynarray_element(&anv_pipeline_to_ray_tracing(pipeline)->shaders,
+ struct anv_shader_bin *,
+ pExecutableInfo->executableIndex);
+ prog_data = (*shader)->prog_data;
+ break;
+ }
default:
unreachable("invalid pipeline type");
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Instruction Count");
WRITE_STR(stat->description,
"Number of GEN instructions in the final generated "
@@ -3157,7 +4278,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.instructions;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "SEND Count");
WRITE_STR(stat->description,
"Number of instructions in the final generated shader "
@@ -3167,7 +4288,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.sends;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Loop Count");
WRITE_STR(stat->description,
"Number of loops (not unrolled) in the final generated "
@@ -3176,7 +4297,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.loops;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Cycle Count");
WRITE_STR(stat->description,
"Estimate of the number of EU cycles required to execute "
@@ -3186,7 +4307,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.cycles;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Spill Count");
WRITE_STR(stat->description,
"Number of scratch spill operations. This gives a rough "
@@ -3197,7 +4318,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.spills;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Fill Count");
WRITE_STR(stat->description,
"Number of scratch fill operations. This gives a rough "
@@ -3208,7 +4329,7 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = exe->stats.fills;
}
- vk_outarray_append(&out, stat) {
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
WRITE_STR(stat->name, "Scratch Memory Size");
WRITE_STR(stat->description,
"Number of bytes of scratch memory required by the "
@@ -3219,15 +4340,50 @@ VkResult anv_GetPipelineExecutableStatisticsKHR(
stat->value.u64 = prog_data->total_scratch;
}
- if (gl_shader_stage_uses_workgroup(exe->stage)) {
- vk_outarray_append(&out, stat) {
- WRITE_STR(stat->name, "Workgroup Memory Size");
- WRITE_STR(stat->description,
- "Number of bytes of workgroup shared memory used by this "
- "shader including any padding.");
- stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Max dispatch width");
+ WRITE_STR(stat->description,
+ "Largest SIMD dispatch width.");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ /* Report the max dispatch width only on the smallest SIMD variant */
+ if (exe->stage != MESA_SHADER_FRAGMENT || exe->stats.dispatch_width == 8)
+ stat->value.u64 = exe->stats.max_dispatch_width;
+ else
+ stat->value.u64 = 0;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Max live registers");
+ WRITE_STR(stat->description,
+ "Maximum number of registers used across the entire shader.");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = exe->stats.max_live_registers;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ WRITE_STR(stat->name, "Workgroup Memory Size");
+ WRITE_STR(stat->description,
+ "Number of bytes of workgroup shared memory used by this "
+ "shader including any padding.");
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ if (gl_shader_stage_uses_workgroup(exe->stage))
stat->value.u64 = prog_data->total_shared;
- }
+ else
+ stat->value.u64 = 0;
+ }
+
+ vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
+ uint32_t hash = pipeline->type == ANV_PIPELINE_COMPUTE ?
+ anv_pipeline_to_compute(pipeline)->source_hash :
+ (pipeline->type == ANV_PIPELINE_GRAPHICS_LIB ||
+ pipeline->type == ANV_PIPELINE_GRAPHICS) ?
+ anv_pipeline_to_graphics_base(pipeline)->source_hashes[exe->stage] :
+ 0 /* No source hash for ray tracing */;
+ WRITE_STR(stat->name, "Source hash");
+ WRITE_STR(stat->description,
+ "hash = 0x%08x. Hash generated from shader source.", hash);
+ stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+ stat->value.u64 = hash;
}
return vk_outarray_status(&out);
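/* Illustrative application-side sketch (not part of this patch): querying the
 * statistics emitted above through VK_KHR_pipeline_executable_properties.
 * Assumes <vulkan/vulkan.h>, <stdio.h> and <inttypes.h>, and that the
 * pipeline was created with VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR.
 */
static void
example_dump_executable_statistics(VkDevice device, VkPipeline pipeline)
{
   const VkPipelineInfoKHR pipeline_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR,
      .pipeline = pipeline,
   };
   /* Only the executable count is needed here, so skip fetching properties. */
   uint32_t exe_count = 0;
   vkGetPipelineExecutablePropertiesKHR(device, &pipeline_info, &exe_count, NULL);

   for (uint32_t e = 0; e < exe_count; e++) {
      const VkPipelineExecutableInfoKHR exe_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
         .pipeline = pipeline,
         .executableIndex = e,
      };
      VkPipelineExecutableStatisticKHR stats[32];
      uint32_t stat_count = 32;
      for (uint32_t s = 0; s < stat_count; s++) {
         stats[s] = (VkPipelineExecutableStatisticKHR) {
            .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR,
         };
      }
      vkGetPipelineExecutableStatisticsKHR(device, &exe_info, &stat_count, stats);
      for (uint32_t s = 0; s < stat_count; s++) {
         if (stats[s].format == VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR)
            printf("  %s: %" PRIu64 "\n", stats[s].name, stats[s].value.u64);
      }
   }
}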
@@ -3261,15 +4417,15 @@ VkResult anv_GetPipelineExecutableInternalRepresentationsKHR(
VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
{
ANV_FROM_HANDLE(anv_pipeline, pipeline, pExecutableInfo->pipeline);
- VK_OUTARRAY_MAKE(out, pInternalRepresentations,
- pInternalRepresentationCount);
+ VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
+ pInternalRepresentations, pInternalRepresentationCount);
bool incomplete_text = false;
const struct anv_pipeline_executable *exe =
anv_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
if (exe->nir) {
- vk_outarray_append(&out, ir) {
+ vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
WRITE_STR(ir->name, "Final NIR");
WRITE_STR(ir->description,
"Final NIR before going into the back-end compiler");
@@ -3280,7 +4436,7 @@ VkResult anv_GetPipelineExecutableInternalRepresentationsKHR(
}
if (exe->disasm) {
- vk_outarray_append(&out, ir) {
+ vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
WRITE_STR(ir->name, "GEN Assembly");
WRITE_STR(ir->description,
"Final GEN assembly for the generated shader binary");
@@ -3295,20 +4451,23 @@ VkResult anv_GetPipelineExecutableInternalRepresentationsKHR(
VkResult
anv_GetRayTracingShaderGroupHandlesKHR(
- VkDevice device,
+ VkDevice _device,
VkPipeline _pipeline,
uint32_t firstGroup,
uint32_t groupCount,
size_t dataSize,
void* pData)
{
+ ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_pipeline, pipeline, _pipeline);
+
if (pipeline->type != ANV_PIPELINE_RAY_TRACING)
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
struct anv_ray_tracing_pipeline *rt_pipeline =
anv_pipeline_to_ray_tracing(pipeline);
+ assert(firstGroup + groupCount <= rt_pipeline->group_count);
for (uint32_t i = 0; i < groupCount; i++) {
struct anv_rt_shader_group *group = &rt_pipeline->groups[firstGroup + i];
memcpy(pData, group->handle, sizeof(group->handle));
@@ -3320,15 +4479,16 @@ anv_GetRayTracingShaderGroupHandlesKHR(
VkResult
anv_GetRayTracingCaptureReplayShaderGroupHandlesKHR(
- VkDevice device,
+ VkDevice _device,
VkPipeline pipeline,
uint32_t firstGroup,
uint32_t groupCount,
size_t dataSize,
void* pData)
{
+ ANV_FROM_HANDLE(anv_device, device, _device);
unreachable("Unimplemented");
- return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}
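/* Illustrative application-side sketch (not part of this patch): fetching the
 * shader group handles written by anv_GetRayTracingShaderGroupHandlesKHR()
 * above in order to build a shader binding table. Assumes <vulkan/vulkan.h>;
 * sbt_host points to caller-allocated storage of at least
 * group_count * shaderGroupHandleSize bytes.
 */
static VkResult
example_fetch_sbt_handles(VkPhysicalDevice physical_device,
                          VkDevice device,
                          VkPipeline rt_pipeline,
                          uint32_t group_count,
                          void *sbt_host)
{
   VkPhysicalDeviceRayTracingPipelinePropertiesKHR rt_props = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_PIPELINE_PROPERTIES_KHR,
   };
   VkPhysicalDeviceProperties2 props2 = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
      .pNext = &rt_props,
   };
   vkGetPhysicalDeviceProperties2(physical_device, &props2);

   const size_t data_size =
      (size_t)group_count * rt_props.shaderGroupHandleSize;
   /* Handles come back tightly packed; when copying them into the SBT
    * buffer, each record's stride must be a multiple of
    * shaderGroupHandleAlignment.
    */
   return vkGetRayTracingShaderGroupHandlesKHR(device, rt_pipeline,
                                               0 /* firstGroup */, group_count,
                                               data_size, sbt_host);
}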
VkDeviceSize
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index 0bbc0849c2a..73a145664a5 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -23,15 +23,184 @@
#include "util/blob.h"
#include "util/hash_table.h"
-#include "util/debug.h"
+#include "util/u_debug.h"
#include "util/disk_cache.h"
#include "util/mesa-sha1.h"
#include "nir/nir_serialize.h"
#include "anv_private.h"
#include "nir/nir_xfb_info.h"
-#include "vulkan/util/vk_util.h"
+#include "vk_util.h"
+#include "compiler/spirv/nir_spirv.h"
+#include "shaders/float64_spv.h"
-struct anv_shader_bin *
+/**
+ * Embedded sampler management.
+ */
+
+static unsigned
+embedded_sampler_key_hash(const void *key)
+{
+ return _mesa_hash_data(key, sizeof(struct anv_embedded_sampler_key));
+}
+
+static bool
+embedded_sampler_key_equal(const void *a, const void *b)
+{
+ return memcmp(a, b, sizeof(struct anv_embedded_sampler_key)) == 0;
+}
+
+static void
+anv_embedded_sampler_free(struct anv_device *device,
+ struct anv_embedded_sampler *sampler)
+{
+ anv_state_pool_free(&device->dynamic_state_db_pool, sampler->sampler_state);
+ anv_state_pool_free(&device->dynamic_state_db_pool, sampler->border_color_state);
+ vk_free(&device->vk.alloc, sampler);
+}
+
+static struct anv_embedded_sampler *
+anv_embedded_sampler_ref(struct anv_embedded_sampler *sampler)
+{
+ sampler->ref_cnt++;
+ return sampler;
+}
+
+static void
+anv_embedded_sampler_unref(struct anv_device *device,
+ struct anv_embedded_sampler *sampler)
+{
+ simple_mtx_lock(&device->embedded_samplers.mutex);
+ if (--sampler->ref_cnt == 0) {
+ _mesa_hash_table_remove_key(device->embedded_samplers.map,
+ &sampler->key);
+ anv_embedded_sampler_free(device, sampler);
+ }
+ simple_mtx_unlock(&device->embedded_samplers.mutex);
+}
+
+void
+anv_device_init_embedded_samplers(struct anv_device *device)
+{
+ simple_mtx_init(&device->embedded_samplers.mutex, mtx_plain);
+ device->embedded_samplers.map =
+ _mesa_hash_table_create(NULL,
+ embedded_sampler_key_hash,
+ embedded_sampler_key_equal);
+}
+
+void
+anv_device_finish_embedded_samplers(struct anv_device *device)
+{
+ hash_table_foreach(device->embedded_samplers.map, entry) {
+ anv_embedded_sampler_free(device, entry->data);
+ }
+ ralloc_free(device->embedded_samplers.map);
+ simple_mtx_destroy(&device->embedded_samplers.mutex);
+}
+
+static VkResult
+anv_shader_bin_get_embedded_samplers(struct anv_device *device,
+ struct anv_shader_bin *shader,
+ const struct anv_pipeline_bind_map *bind_map)
+{
+ VkResult result = VK_SUCCESS;
+
+ simple_mtx_lock(&device->embedded_samplers.mutex);
+
+ for (uint32_t i = 0; i < bind_map->embedded_sampler_count; i++) {
+ struct hash_entry *entry =
+ _mesa_hash_table_search(device->embedded_samplers.map,
+ &bind_map->embedded_sampler_to_binding[i].key);
+ if (entry == NULL) {
+ shader->embedded_samplers[i] =
+ vk_zalloc(&device->vk.alloc,
+ sizeof(struct anv_embedded_sampler), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (shader->embedded_samplers[i] == NULL) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto err;
+ }
+
+ anv_genX(device->info, emit_embedded_sampler)(
+ device, shader->embedded_samplers[i],
+ &bind_map->embedded_sampler_to_binding[i]);
+ _mesa_hash_table_insert(device->embedded_samplers.map,
+ &shader->embedded_samplers[i]->key,
+ shader->embedded_samplers[i]);
+ } else {
+ shader->embedded_samplers[i] = anv_embedded_sampler_ref(entry->data);
+ }
+ }
+
+ err:
+ simple_mtx_unlock(&device->embedded_samplers.mutex);
+ return result;
+}
+
+/**
+ * vk_pipeline_cache object hooks for anv_shader_bin.
+ */
+
+static bool
+anv_shader_bin_serialize(struct vk_pipeline_cache_object *object,
+ struct blob *blob);
+
+struct vk_pipeline_cache_object *
+anv_shader_bin_deserialize(struct vk_pipeline_cache *cache,
+ const void *key_data, size_t key_size,
+ struct blob_reader *blob);
+
+static void
+anv_shader_bin_destroy(struct vk_device *_device,
+ struct vk_pipeline_cache_object *object)
+{
+ struct anv_device *device =
+ container_of(_device, struct anv_device, vk);
+
+ struct anv_shader_bin *shader =
+ container_of(object, struct anv_shader_bin, base);
+
+ for (uint32_t i = 0; i < shader->bind_map.embedded_sampler_count; i++)
+ anv_embedded_sampler_unref(device, shader->embedded_samplers[i]);
+
+ anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
+ vk_pipeline_cache_object_finish(&shader->base);
+ vk_free(&device->vk.alloc, shader);
+}
+
+static const struct vk_pipeline_cache_object_ops anv_shader_bin_ops = {
+ .serialize = anv_shader_bin_serialize,
+ .deserialize = anv_shader_bin_deserialize,
+ .destroy = anv_shader_bin_destroy,
+};
+
+const struct vk_pipeline_cache_object_ops *const anv_cache_import_ops[2] = {
+ &anv_shader_bin_ops,
+ NULL
+};
+
+static void
+anv_shader_bin_rewrite_embedded_samplers(struct anv_device *device,
+ struct anv_shader_bin *shader,
+ const struct anv_pipeline_bind_map *bind_map,
+ const struct brw_stage_prog_data *prog_data_in)
+{
+ int rv_count = 0;
+ struct brw_shader_reloc_value reloc_values[BRW_MAX_EMBEDDED_SAMPLERS];
+
+ for (uint32_t i = 0; i < bind_map->embedded_sampler_count; i++) {
+ reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
+ .id = BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE + i,
+ .value = shader->embedded_samplers[i]->sampler_state.offset,
+ };
+ }
+
+ brw_write_shader_relocs(&device->physical->compiler->isa,
+ shader->kernel.map, prog_data_in,
+ reloc_values, rv_count);
+}
+
+static struct anv_shader_bin *
anv_shader_bin_create(struct anv_device *device,
gl_shader_stage stage,
const void *key_data, uint32_t key_size,
@@ -40,12 +209,13 @@ anv_shader_bin_create(struct anv_device *device,
uint32_t prog_data_size,
const struct brw_compile_stats *stats, uint32_t num_stats,
const nir_xfb_info *xfb_info_in,
- const struct anv_pipeline_bind_map *bind_map)
+ const struct anv_pipeline_bind_map *bind_map,
+ const struct anv_push_descriptor_info *push_desc_info,
+ enum anv_dynamic_push_bits dynamic_push_values)
{
VK_MULTIALLOC(ma);
VK_MULTIALLOC_DECL(&ma, struct anv_shader_bin, shader, 1);
- VK_MULTIALLOC_DECL_SIZE(&ma, struct anv_shader_bin_key, key,
- sizeof(*key) + key_size);
+ VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size);
VK_MULTIALLOC_DECL_SIZE(&ma, struct brw_stage_prog_data, prog_data,
prog_data_size);
VK_MULTIALLOC_DECL(&ma, struct brw_shader_reloc, prog_data_relocs,
@@ -59,38 +229,69 @@ anv_shader_bin_create(struct anv_device *device,
VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, surface_to_descriptor,
bind_map->surface_count);
VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_binding, sampler_to_descriptor,
- bind_map->sampler_count);
+ bind_map->sampler_count);
+ VK_MULTIALLOC_DECL(&ma, struct anv_pipeline_embedded_sampler_binding,
+ embedded_sampler_to_binding,
+ bind_map->embedded_sampler_count);
+ VK_MULTIALLOC_DECL(&ma, struct brw_kernel_arg_desc, kernel_args,
+ bind_map->kernel_arg_count);
+ VK_MULTIALLOC_DECL(&ma, struct anv_embedded_sampler *, embedded_samplers,
+ bind_map->embedded_sampler_count);
if (!vk_multialloc_alloc(&ma, &device->vk.alloc,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
return NULL;
- shader->ref_cnt = 1;
+ memcpy(obj_key_data, key_data, key_size);
+ vk_pipeline_cache_object_init(&device->vk, &shader->base,
+ &anv_shader_bin_ops, obj_key_data, key_size);
shader->stage = stage;
- key->size = key_size;
- memcpy(key->data, key_data, key_size);
- shader->key = key;
-
shader->kernel =
anv_state_pool_alloc(&device->instruction_state_pool, kernel_size, 64);
memcpy(shader->kernel.map, kernel_data, kernel_size);
shader->kernel_size = kernel_size;
- uint64_t shader_data_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS +
- shader->kernel.offset +
- prog_data_in->const_data_offset;
+ if (bind_map->embedded_sampler_count > 0) {
+ shader->embedded_samplers = embedded_samplers;
+ if (anv_shader_bin_get_embedded_samplers(device, shader, bind_map) != VK_SUCCESS) {
+ anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
+ vk_free(&device->vk.alloc, shader);
+ return NULL;
+ }
+ }
+
+ uint64_t shader_data_addr =
+ device->physical->va.instruction_state_pool.addr +
+ shader->kernel.offset +
+ prog_data_in->const_data_offset;
int rv_count = 0;
- struct brw_shader_reloc_value reloc_values[5];
+ struct brw_shader_reloc_value reloc_values[7];
+ assert((device->physical->va.descriptor_buffer_pool.addr & 0xffffffff) == 0);
+ reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
+ .id = BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH,
+ .value = device->physical->va.descriptor_buffer_pool.addr >> 32,
+ };
+ assert((device->physical->va.indirect_descriptor_pool.addr & 0xffffffff) == 0);
+ assert((device->physical->va.internal_surface_state_pool.addr & 0xffffffff) == 0);
+ reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
+ .id = BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
+ .value = device->physical->indirect_descriptors ?
+ (device->physical->va.indirect_descriptor_pool.addr >> 32) :
+ (device->physical->va.internal_surface_state_pool.addr >> 32),
+ };
+ assert((device->physical->va.instruction_state_pool.addr & 0xffffffff) == 0);
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
.value = shader_data_addr,
};
+ assert((device->physical->va.instruction_state_pool.addr & 0xffffffff) == 0);
+ assert(shader_data_addr >> 32 == device->physical->va.instruction_state_pool.addr >> 32);
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
- .value = shader_data_addr >> 32,
+ .value = device->physical->va.instruction_state_pool.addr >> 32,
};
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_SHADER_START_OFFSET,
@@ -99,9 +300,10 @@ anv_shader_bin_create(struct anv_device *device,
if (brw_shader_stage_is_bindless(stage)) {
const struct brw_bs_prog_data *bs_prog_data =
brw_bs_prog_data_const(prog_data_in);
- uint64_t resume_sbt_addr = INSTRUCTION_STATE_POOL_MIN_ADDRESS +
- shader->kernel.offset +
- bs_prog_data->resume_sbt_offset;
+ uint64_t resume_sbt_addr =
+ device->physical->va.instruction_state_pool.addr +
+ shader->kernel.offset +
+ bs_prog_data->resume_sbt_offset;
reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
.id = BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW,
.value = resume_sbt_addr,
@@ -112,9 +314,12 @@ anv_shader_bin_create(struct anv_device *device,
};
}
- brw_write_shader_relocs(&device->info, shader->kernel.map, prog_data_in,
+ brw_write_shader_relocs(&device->physical->compiler->isa,
+ shader->kernel.map, prog_data_in,
reloc_values, rv_count);
+ anv_shader_bin_rewrite_embedded_samplers(device, shader, bind_map, prog_data_in);
+
memcpy(prog_data, prog_data_in, prog_data_size);
typed_memcpy(prog_data_relocs, prog_data_in->relocs,
prog_data_in->num_relocs);
@@ -138,40 +343,52 @@ anv_shader_bin_create(struct anv_device *device,
shader->xfb_info = NULL;
}
+ shader->dynamic_push_values = dynamic_push_values;
+
+ typed_memcpy(&shader->push_desc_info, push_desc_info, 1);
+
shader->bind_map = *bind_map;
+
typed_memcpy(surface_to_descriptor, bind_map->surface_to_descriptor,
bind_map->surface_count);
shader->bind_map.surface_to_descriptor = surface_to_descriptor;
+
typed_memcpy(sampler_to_descriptor, bind_map->sampler_to_descriptor,
bind_map->sampler_count);
shader->bind_map.sampler_to_descriptor = sampler_to_descriptor;
- return shader;
-}
+ typed_memcpy(embedded_sampler_to_binding, bind_map->embedded_sampler_to_binding,
+ bind_map->embedded_sampler_count);
+ shader->bind_map.embedded_sampler_to_binding = embedded_sampler_to_binding;
-void
-anv_shader_bin_destroy(struct anv_device *device,
- struct anv_shader_bin *shader)
-{
- assert(shader->ref_cnt == 0);
- anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
- vk_free(&device->vk.alloc, shader);
+ typed_memcpy(kernel_args, bind_map->kernel_args,
+ bind_map->kernel_arg_count);
+ shader->bind_map.kernel_args = kernel_args;
+
+ return shader;
}
static bool
-anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
- struct blob *blob)
+anv_shader_bin_serialize(struct vk_pipeline_cache_object *object,
+ struct blob *blob)
{
- blob_write_uint32(blob, shader->stage);
+ struct anv_shader_bin *shader =
+ container_of(object, struct anv_shader_bin, base);
- blob_write_uint32(blob, shader->key->size);
- blob_write_bytes(blob, shader->key->data, shader->key->size);
+ blob_write_uint32(blob, shader->stage);
blob_write_uint32(blob, shader->kernel_size);
blob_write_bytes(blob, shader->kernel.map, shader->kernel_size);
blob_write_uint32(blob, shader->prog_data_size);
- blob_write_bytes(blob, shader->prog_data, shader->prog_data_size);
+
+ union brw_any_prog_data prog_data;
+ assert(shader->prog_data_size <= sizeof(prog_data));
+ memcpy(&prog_data, shader->prog_data, shader->prog_data_size);
+ prog_data.base.relocs = NULL;
+ prog_data.base.param = NULL;
+ blob_write_bytes(blob, &prog_data, shader->prog_data_size);
+
blob_write_bytes(blob, shader->prog_data->relocs,
shader->prog_data->num_relocs *
sizeof(shader->prog_data->relocs[0]));
@@ -189,6 +406,12 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
blob_write_uint32(blob, 0);
}
+ blob_write_uint32(blob, shader->dynamic_push_values);
+
+ blob_write_uint32(blob, shader->push_desc_info.used_descriptors);
+ blob_write_uint32(blob, shader->push_desc_info.fully_promoted_ubo_descriptors);
+ blob_write_uint8(blob, shader->push_desc_info.used_set_buffer);
+
blob_write_bytes(blob, shader->bind_map.surface_sha1,
sizeof(shader->bind_map.surface_sha1));
blob_write_bytes(blob, shader->bind_map.sampler_sha1,
@@ -197,26 +420,39 @@ anv_shader_bin_write_to_blob(const struct anv_shader_bin *shader,
sizeof(shader->bind_map.push_sha1));
blob_write_uint32(blob, shader->bind_map.surface_count);
blob_write_uint32(blob, shader->bind_map.sampler_count);
+ blob_write_uint32(blob, shader->bind_map.embedded_sampler_count);
+ if (shader->stage == MESA_SHADER_KERNEL) {
+ uint32_t packed = (uint32_t)shader->bind_map.kernel_args_size << 16 |
+ (uint32_t)shader->bind_map.kernel_arg_count;
+ blob_write_uint32(blob, packed);
+ }
blob_write_bytes(blob, shader->bind_map.surface_to_descriptor,
shader->bind_map.surface_count *
sizeof(*shader->bind_map.surface_to_descriptor));
blob_write_bytes(blob, shader->bind_map.sampler_to_descriptor,
shader->bind_map.sampler_count *
sizeof(*shader->bind_map.sampler_to_descriptor));
+ blob_write_bytes(blob, shader->bind_map.embedded_sampler_to_binding,
+ shader->bind_map.embedded_sampler_count *
+ sizeof(*shader->bind_map.embedded_sampler_to_binding));
+ blob_write_bytes(blob, shader->bind_map.kernel_args,
+ shader->bind_map.kernel_arg_count *
+ sizeof(*shader->bind_map.kernel_args));
blob_write_bytes(blob, shader->bind_map.push_ranges,
sizeof(shader->bind_map.push_ranges));
return !blob->out_of_memory;
}
-static struct anv_shader_bin *
-anv_shader_bin_create_from_blob(struct anv_device *device,
- struct blob_reader *blob)
+struct vk_pipeline_cache_object *
+anv_shader_bin_deserialize(struct vk_pipeline_cache *cache,
+ const void *key_data, size_t key_size,
+ struct blob_reader *blob)
{
- gl_shader_stage stage = blob_read_uint32(blob);
+ struct anv_device *device =
+ container_of(cache->base.device, struct anv_device, vk);
- uint32_t key_size = blob_read_uint32(blob);
- const void *key_data = blob_read_bytes(blob, key_size);
+ gl_shader_stage stage = blob_read_uint32(blob);
uint32_t kernel_size = blob_read_uint32(blob);
const void *kernel_data = blob_read_bytes(blob, kernel_size);
@@ -242,614 +478,205 @@ anv_shader_bin_create_from_blob(struct anv_device *device,
if (xfb_size)
xfb_info = blob_read_bytes(blob, xfb_size);
- struct anv_pipeline_bind_map bind_map;
+ enum anv_dynamic_push_bits dynamic_push_values = blob_read_uint32(blob);
+
+ struct anv_push_descriptor_info push_desc_info = {};
+ push_desc_info.used_descriptors = blob_read_uint32(blob);
+ push_desc_info.fully_promoted_ubo_descriptors = blob_read_uint32(blob);
+ push_desc_info.used_set_buffer = blob_read_uint8(blob);
+
+ struct anv_pipeline_bind_map bind_map = {};
blob_copy_bytes(blob, bind_map.surface_sha1, sizeof(bind_map.surface_sha1));
blob_copy_bytes(blob, bind_map.sampler_sha1, sizeof(bind_map.sampler_sha1));
blob_copy_bytes(blob, bind_map.push_sha1, sizeof(bind_map.push_sha1));
bind_map.surface_count = blob_read_uint32(blob);
bind_map.sampler_count = blob_read_uint32(blob);
+ bind_map.embedded_sampler_count = blob_read_uint32(blob);
+ if (stage == MESA_SHADER_KERNEL) {
+ uint32_t packed = blob_read_uint32(blob);
+ bind_map.kernel_args_size = (uint16_t)(packed >> 16);
+ bind_map.kernel_arg_count = (uint16_t)packed;
+ }
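/* Illustrative sketch, not part of this patch: round trip of the packed
 * kernel-args word written in anv_shader_bin_serialize() above, using the
 * hypothetical values kernel_args_size == 64 and kernel_arg_count == 3:
 *
 *    packed                    == (64u << 16) | 3u  == 0x00400003
 *    (uint16_t)(packed >> 16)  == 64                (kernel_args_size)
 *    (uint16_t)packed          == 3                 (kernel_arg_count)
 */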
bind_map.surface_to_descriptor = (void *)
blob_read_bytes(blob, bind_map.surface_count *
sizeof(*bind_map.surface_to_descriptor));
bind_map.sampler_to_descriptor = (void *)
blob_read_bytes(blob, bind_map.sampler_count *
sizeof(*bind_map.sampler_to_descriptor));
+ bind_map.embedded_sampler_to_binding = (void *)
+ blob_read_bytes(blob, bind_map.embedded_sampler_count *
+ sizeof(*bind_map.embedded_sampler_to_binding));
+ bind_map.kernel_args = (void *)
+ blob_read_bytes(blob, bind_map.kernel_arg_count *
+ sizeof(*bind_map.kernel_args));
blob_copy_bytes(blob, bind_map.push_ranges, sizeof(bind_map.push_ranges));
if (blob->overrun)
return NULL;
- return anv_shader_bin_create(device, stage,
- key_data, key_size,
- kernel_data, kernel_size,
- &prog_data.base, prog_data_size,
- stats, num_stats, xfb_info, &bind_map);
-}
-
-/* Remaining work:
- *
- * - Compact binding table layout so it's tight and not dependent on
- * descriptor set layout.
- *
- * - Review prog_data struct for size and cacheability: struct
- * brw_stage_prog_data has binding_table which uses a lot of uint32_t for 8
- * bit quantities etc; use bit fields for all bools, eg dual_src_blend.
- */
-
-static uint32_t
-shader_bin_key_hash_func(const void *void_key)
-{
- const struct anv_shader_bin_key *key = void_key;
- return _mesa_hash_data(key->data, key->size);
-}
-
-static bool
-shader_bin_key_compare_func(const void *void_a, const void *void_b)
-{
- const struct anv_shader_bin_key *a = void_a, *b = void_b;
- if (a->size != b->size)
- return false;
-
- return memcmp(a->data, b->data, a->size) == 0;
-}
-
-static uint32_t
-sha1_hash_func(const void *sha1)
-{
- return _mesa_hash_data(sha1, 20);
-}
-
-static bool
-sha1_compare_func(const void *sha1_a, const void *sha1_b)
-{
- return memcmp(sha1_a, sha1_b, 20) == 0;
-}
-
-void
-anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
- struct anv_device *device,
- bool cache_enabled,
- bool external_sync)
-{
- vk_object_base_init(&device->vk, &cache->base,
- VK_OBJECT_TYPE_PIPELINE_CACHE);
- cache->device = device;
- cache->external_sync = external_sync;
- pthread_mutex_init(&cache->mutex, NULL);
-
- if (cache_enabled) {
- cache->cache = _mesa_hash_table_create(NULL, shader_bin_key_hash_func,
- shader_bin_key_compare_func);
- cache->nir_cache = _mesa_hash_table_create(NULL, sha1_hash_func,
- sha1_compare_func);
- } else {
- cache->cache = NULL;
- cache->nir_cache = NULL;
- }
-}
-
-void
-anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
-{
- pthread_mutex_destroy(&cache->mutex);
-
- if (cache->cache) {
- /* This is a bit unfortunate. In order to keep things from randomly
- * going away, the shader cache has to hold a reference to all shader
- * binaries it contains. We unref them when we destroy the cache.
- */
- hash_table_foreach(cache->cache, entry)
- anv_shader_bin_unref(cache->device, entry->data);
-
- _mesa_hash_table_destroy(cache->cache, NULL);
- }
-
- if (cache->nir_cache) {
- hash_table_foreach(cache->nir_cache, entry)
- ralloc_free(entry->data);
-
- _mesa_hash_table_destroy(cache->nir_cache, NULL);
- }
-
- vk_object_base_finish(&cache->base);
-}
-
-static struct anv_shader_bin *
-anv_pipeline_cache_search_locked(struct anv_pipeline_cache *cache,
- const void *key_data, uint32_t key_size)
-{
- uint32_t vla[1 + DIV_ROUND_UP(key_size, sizeof(uint32_t))];
- struct anv_shader_bin_key *key = (void *)vla;
- key->size = key_size;
- memcpy(key->data, key_data, key_size);
-
- struct hash_entry *entry = _mesa_hash_table_search(cache->cache, key);
- if (entry)
- return entry->data;
- else
- return NULL;
-}
-
-static inline void
-anv_cache_lock(struct anv_pipeline_cache *cache)
-{
- if (!cache->external_sync)
- pthread_mutex_lock(&cache->mutex);
-}
-
-static inline void
-anv_cache_unlock(struct anv_pipeline_cache *cache)
-{
- if (!cache->external_sync)
- pthread_mutex_unlock(&cache->mutex);
-}
-
-struct anv_shader_bin *
-anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
- const void *key_data, uint32_t key_size)
-{
- if (!cache->cache)
- return NULL;
-
- anv_cache_lock(cache);
-
- struct anv_shader_bin *shader =
- anv_pipeline_cache_search_locked(cache, key_data, key_size);
-
- anv_cache_unlock(cache);
-
- /* We increment refcount before handing it to the caller */
- if (shader)
- anv_shader_bin_ref(shader);
-
- return shader;
-}
-
-static void
-anv_pipeline_cache_add_shader_bin(struct anv_pipeline_cache *cache,
- struct anv_shader_bin *bin)
-{
- if (!cache->cache)
- return;
-
- anv_cache_lock(cache);
-
- struct hash_entry *entry = _mesa_hash_table_search(cache->cache, bin->key);
- if (entry == NULL) {
- /* Take a reference for the cache */
- anv_shader_bin_ref(bin);
- _mesa_hash_table_insert(cache->cache, bin->key, bin);
- }
-
- anv_cache_unlock(cache);
-}
-
-static struct anv_shader_bin *
-anv_pipeline_cache_add_shader_locked(struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data,
- uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map)
-{
struct anv_shader_bin *shader =
- anv_pipeline_cache_search_locked(cache, key_data, key_size);
- if (shader)
- return shader;
-
- struct anv_shader_bin *bin =
- anv_shader_bin_create(cache->device, stage,
+ anv_shader_bin_create(device, stage,
key_data, key_size,
kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats, xfb_info, bind_map);
- if (!bin)
+ &prog_data.base, prog_data_size,
+ stats, num_stats, xfb_info, &bind_map,
+ &push_desc_info,
+ dynamic_push_values);
+ if (shader == NULL)
return NULL;
- _mesa_hash_table_insert(cache->cache, bin->key, bin);
-
- return bin;
-}
-
-struct anv_shader_bin *
-anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map)
-{
- if (cache->cache) {
- anv_cache_lock(cache);
-
- struct anv_shader_bin *bin =
- anv_pipeline_cache_add_shader_locked(cache, stage, key_data, key_size,
- kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats,
- xfb_info, bind_map);
-
- anv_cache_unlock(cache);
-
- /* We increment refcount before handing it to the caller */
- if (bin)
- anv_shader_bin_ref(bin);
-
- return bin;
- } else {
- /* In this case, we're not caching it so the caller owns it entirely */
- return anv_shader_bin_create(cache->device, stage,
- key_data, key_size,
- kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats,
- xfb_info, bind_map);
- }
-}
-
-static void
-anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
- const void *data, size_t size)
-{
- struct anv_device *device = cache->device;
- struct anv_physical_device *pdevice = device->physical;
-
- if (cache->cache == NULL)
- return;
-
- struct blob_reader blob;
- blob_reader_init(&blob, data, size);
-
- struct vk_pipeline_cache_header header;
- blob_copy_bytes(&blob, &header, sizeof(header));
- uint32_t count = blob_read_uint32(&blob);
- if (blob.overrun)
- return;
-
- if (header.header_size < sizeof(header))
- return;
- if (header.header_version != VK_PIPELINE_CACHE_HEADER_VERSION_ONE)
- return;
- if (header.vendor_id != 0x8086)
- return;
- if (header.device_id != device->info.chipset_id)
- return;
- if (memcmp(header.uuid, pdevice->pipeline_cache_uuid, VK_UUID_SIZE) != 0)
- return;
-
- for (uint32_t i = 0; i < count; i++) {
- struct anv_shader_bin *bin =
- anv_shader_bin_create_from_blob(device, &blob);
- if (!bin)
- break;
- _mesa_hash_table_insert(cache->cache, bin->key, bin);
- }
-}
-
-VkResult anv_CreatePipelineCache(
- VkDevice _device,
- const VkPipelineCacheCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkPipelineCache* pPipelineCache)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_pipeline_cache *cache;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO);
-
- cache = vk_alloc2(&device->vk.alloc, pAllocator,
- sizeof(*cache), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (cache == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- anv_pipeline_cache_init(cache, device,
- device->physical->instance->pipeline_cache_enabled,
- pCreateInfo->flags & VK_PIPELINE_CACHE_CREATE_EXTERNALLY_SYNCHRONIZED_BIT_EXT);
-
- if (pCreateInfo->initialDataSize > 0)
- anv_pipeline_cache_load(cache,
- pCreateInfo->pInitialData,
- pCreateInfo->initialDataSize);
-
- *pPipelineCache = anv_pipeline_cache_to_handle(cache);
-
- return VK_SUCCESS;
-}
-
-void anv_DestroyPipelineCache(
- VkDevice _device,
- VkPipelineCache _cache,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
-
- if (!cache)
- return;
-
- anv_pipeline_cache_finish(cache);
-
- vk_free2(&device->vk.alloc, pAllocator, cache);
-}
-
-VkResult anv_GetPipelineCacheData(
- VkDevice _device,
- VkPipelineCache _cache,
- size_t* pDataSize,
- void* pData)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
-
- struct blob blob;
- if (pData) {
- blob_init_fixed(&blob, pData, *pDataSize);
- } else {
- blob_init_fixed(&blob, NULL, SIZE_MAX);
- }
-
- struct vk_pipeline_cache_header header = {
- .header_size = sizeof(struct vk_pipeline_cache_header),
- .header_version = VK_PIPELINE_CACHE_HEADER_VERSION_ONE,
- .vendor_id = 0x8086,
- .device_id = device->info.chipset_id,
- };
- memcpy(header.uuid, device->physical->pipeline_cache_uuid, VK_UUID_SIZE);
- blob_write_bytes(&blob, &header, sizeof(header));
-
- uint32_t count = 0;
- intptr_t count_offset = blob_reserve_uint32(&blob);
- if (count_offset < 0) {
- *pDataSize = 0;
- blob_finish(&blob);
- return VK_INCOMPLETE;
- }
-
- VkResult result = VK_SUCCESS;
- if (cache->cache) {
- hash_table_foreach(cache->cache, entry) {
- struct anv_shader_bin *shader = entry->data;
-
- size_t save_size = blob.size;
- if (!anv_shader_bin_write_to_blob(shader, &blob)) {
- /* If it fails reset to the previous size and bail */
- blob.size = save_size;
- result = VK_INCOMPLETE;
- break;
- }
-
- count++;
- }
- }
-
- blob_overwrite_uint32(&blob, count_offset, count);
-
- *pDataSize = blob.size;
-
- blob_finish(&blob);
-
- return result;
-}
-
-VkResult anv_MergePipelineCaches(
- VkDevice _device,
- VkPipelineCache destCache,
- uint32_t srcCacheCount,
- const VkPipelineCache* pSrcCaches)
-{
- ANV_FROM_HANDLE(anv_pipeline_cache, dst, destCache);
-
- if (!dst->cache)
- return VK_SUCCESS;
-
- for (uint32_t i = 0; i < srcCacheCount; i++) {
- ANV_FROM_HANDLE(anv_pipeline_cache, src, pSrcCaches[i]);
- if (!src->cache)
- continue;
-
- hash_table_foreach(src->cache, entry) {
- struct anv_shader_bin *bin = entry->data;
- assert(bin);
-
- if (_mesa_hash_table_search(dst->cache, bin->key))
- continue;
-
- anv_shader_bin_ref(bin);
- _mesa_hash_table_insert(dst->cache, bin->key, bin);
- }
- }
-
- return VK_SUCCESS;
+ return &shader->base;
}
struct anv_shader_bin *
anv_device_search_for_kernel(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const void *key_data, uint32_t key_size,
bool *user_cache_hit)
{
- struct anv_shader_bin *bin;
-
- *user_cache_hit = false;
-
- if (cache) {
- bin = anv_pipeline_cache_search(cache, key_data, key_size);
- if (bin) {
- *user_cache_hit = cache != &device->default_pipeline_cache;
- return bin;
- }
+ /* Use the default pipeline cache if none is specified */
+ if (cache == NULL)
+ cache = device->default_pipeline_cache;
+
+ bool cache_hit = false;
+ struct vk_pipeline_cache_object *object =
+ vk_pipeline_cache_lookup_object(cache, key_data, key_size,
+ &anv_shader_bin_ops, &cache_hit);
+ if (user_cache_hit != NULL) {
+ *user_cache_hit = object != NULL && cache_hit &&
+ cache != device->default_pipeline_cache;
}
-#ifdef ENABLE_SHADER_CACHE
- struct disk_cache *disk_cache = device->physical->disk_cache;
- if (disk_cache && device->physical->instance->pipeline_cache_enabled) {
- cache_key cache_key;
- disk_cache_compute_key(disk_cache, key_data, key_size, cache_key);
-
- size_t buffer_size;
- uint8_t *buffer = disk_cache_get(disk_cache, cache_key, &buffer_size);
- if (buffer) {
- struct blob_reader blob;
- blob_reader_init(&blob, buffer, buffer_size);
- bin = anv_shader_bin_create_from_blob(device, &blob);
- free(buffer);
-
- if (bin) {
- if (cache)
- anv_pipeline_cache_add_shader_bin(cache, bin);
- return bin;
- }
- }
- }
-#endif
+ if (object == NULL)
+ return NULL;
- return NULL;
+ return container_of(object, struct anv_shader_bin, base);
}
struct anv_shader_bin *
anv_device_upload_kernel(struct anv_device *device,
- struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map)
+ struct vk_pipeline_cache *cache,
+ const struct anv_shader_upload_params *params)
{
- struct anv_shader_bin *bin;
- if (cache) {
- bin = anv_pipeline_cache_upload_kernel(cache, stage, key_data, key_size,
- kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats,
- xfb_info, bind_map);
- } else {
- bin = anv_shader_bin_create(device, stage, key_data, key_size,
- kernel_data, kernel_size,
- prog_data, prog_data_size,
- stats, num_stats,
- xfb_info, bind_map);
- }
+ /* Use the default pipeline cache if none is specified */
+ if (cache == NULL)
+ cache = device->default_pipeline_cache;
- if (bin == NULL)
- return NULL;
-#ifdef ENABLE_SHADER_CACHE
- struct disk_cache *disk_cache = device->physical->disk_cache;
- if (disk_cache) {
- struct blob binary;
- blob_init(&binary);
- if (anv_shader_bin_write_to_blob(bin, &binary)) {
- cache_key cache_key;
- disk_cache_compute_key(disk_cache, key_data, key_size, cache_key);
- disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL);
- }
+ struct anv_shader_bin *shader =
+ anv_shader_bin_create(device,
+ params->stage,
+ params->key_data,
+ params->key_size,
+ params->kernel_data,
+ params->kernel_size,
+ params->prog_data,
+ params->prog_data_size,
+ params->stats,
+ params->num_stats,
+ params->xfb_info,
+ params->bind_map,
+ params->push_desc_info,
+ params->dynamic_push_values);
+ if (shader == NULL)
+ return NULL;
- blob_finish(&binary);
- }
-#endif
+ struct vk_pipeline_cache_object *cached =
+ vk_pipeline_cache_add_object(cache, &shader->base);
- return bin;
+ return container_of(cached, struct anv_shader_bin, base);
}
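/* Illustrative sketch, not part of this patch: the lookup-then-upload pattern
 * a caller is expected to follow with the two helpers above. The wrapper name
 * and the idea of reusing the search key as params->key_data are assumptions
 * for illustration only.
 */
static struct anv_shader_bin *
example_get_or_upload_kernel(struct anv_device *device,
                             struct vk_pipeline_cache *cache,
                             const void *key_data, uint32_t key_size,
                             const struct anv_shader_upload_params *params)
{
   bool user_cache_hit;
   struct anv_shader_bin *bin =
      anv_device_search_for_kernel(device, cache, key_data, key_size,
                                   &user_cache_hit);
   if (bin != NULL)
      return bin;

   /* Cache miss: the caller compiles the shader, fills *params (with
    * key_data/key_size matching the key searched above), then uploads it,
    * which also adds it to the cache.
    */
   return anv_device_upload_kernel(device, cache, params);
}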
-struct serialized_nir {
- unsigned char sha1_key[20];
- size_t size;
- char data[0];
-};
+#define SHA1_KEY_SIZE 20
struct nir_shader *
anv_device_search_for_nir(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const nir_shader_compiler_options *nir_options,
- unsigned char sha1_key[20],
+ unsigned char sha1_key[SHA1_KEY_SIZE],
void *mem_ctx)
{
- if (cache && cache->nir_cache) {
- const struct serialized_nir *snir = NULL;
-
- anv_cache_lock(cache);
- struct hash_entry *entry =
- _mesa_hash_table_search(cache->nir_cache, sha1_key);
- if (entry)
- snir = entry->data;
- anv_cache_unlock(cache);
-
- if (snir) {
- struct blob_reader blob;
- blob_reader_init(&blob, snir->data, snir->size);
-
- nir_shader *nir = nir_deserialize(mem_ctx, nir_options, &blob);
- if (blob.overrun) {
- ralloc_free(nir);
- } else {
- return nir;
- }
- }
- }
+ if (cache == NULL)
+ cache = device->default_pipeline_cache;
- return NULL;
+ return vk_pipeline_cache_lookup_nir(cache, sha1_key, SHA1_KEY_SIZE,
+ nir_options, NULL, mem_ctx);
}
void
anv_device_upload_nir(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const struct nir_shader *nir,
- unsigned char sha1_key[20])
+ unsigned char sha1_key[SHA1_KEY_SIZE])
{
- if (cache && cache->nir_cache) {
- anv_cache_lock(cache);
- struct hash_entry *entry =
- _mesa_hash_table_search(cache->nir_cache, sha1_key);
- anv_cache_unlock(cache);
- if (entry)
- return;
-
- struct blob blob;
- blob_init(&blob);
-
- nir_serialize(&blob, nir, false);
- if (blob.out_of_memory) {
- blob_finish(&blob);
- return;
- }
+ if (cache == NULL)
+ cache = device->default_pipeline_cache;
- anv_cache_lock(cache);
- /* Because ralloc isn't thread-safe, we have to do all this inside the
- * lock. We could unlock for the big memcpy but it's probably not worth
- * the hassle.
- */
- entry = _mesa_hash_table_search(cache->nir_cache, sha1_key);
- if (entry) {
- blob_finish(&blob);
- anv_cache_unlock(cache);
- return;
- }
+ vk_pipeline_cache_add_nir(cache, sha1_key, SHA1_KEY_SIZE, nir);
+}
+
+void
+anv_load_fp64_shader(struct anv_device *device)
+{
+ const nir_shader_compiler_options *nir_options =
+ device->physical->compiler->nir_options[MESA_SHADER_VERTEX];
+
+ const char* shader_name = "float64_spv_lib";
+ struct mesa_sha1 sha1_ctx;
+ uint8_t sha1[20];
+ _mesa_sha1_init(&sha1_ctx);
+ _mesa_sha1_update(&sha1_ctx, shader_name, strlen(shader_name));
+ _mesa_sha1_final(&sha1_ctx, sha1);
+
+ device->fp64_nir =
+ anv_device_search_for_nir(device, device->internal_cache,
+ nir_options, sha1, NULL);
+
+   /* The shader was found, no need to call spirv_to_nir() again. */
+ if (device->fp64_nir)
+ return;
- struct serialized_nir *snir =
- ralloc_size(cache->nir_cache, sizeof(*snir) + blob.size);
- memcpy(snir->sha1_key, sha1_key, 20);
- snir->size = blob.size;
- memcpy(snir->data, blob.data, blob.size);
+ struct spirv_to_nir_options spirv_options = {
+ .caps = {
+ .address = true,
+ .float64 = true,
+ .int8 = true,
+ .int16 = true,
+ .int64 = true,
+ },
+ .environment = NIR_SPIRV_VULKAN,
+ .create_library = true
+ };
- blob_finish(&blob);
+ nir_shader* nir =
+ spirv_to_nir(float64_spv_source, sizeof(float64_spv_source) / 4,
+ NULL, 0, MESA_SHADER_VERTEX, "main",
+ &spirv_options, nir_options);
- _mesa_hash_table_insert(cache->nir_cache, snir->sha1_key, snir);
+ assert(nir != NULL);
- anv_cache_unlock(cache);
- }
+ nir_validate_shader(nir, "after spirv_to_nir");
+ nir_validate_ssa_dominance(nir, "after spirv_to_nir");
+
+ NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp);
+ NIR_PASS_V(nir, nir_lower_returns);
+ NIR_PASS_V(nir, nir_inline_functions);
+ NIR_PASS_V(nir, nir_opt_deref);
+
+ NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+ NIR_PASS_V(nir, nir_copy_prop);
+ NIR_PASS_V(nir, nir_opt_dce);
+ NIR_PASS_V(nir, nir_opt_cse);
+ NIR_PASS_V(nir, nir_opt_gcm, true);
+ NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false);
+ NIR_PASS_V(nir, nir_opt_dce);
+
+ NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_function_temp,
+ nir_address_format_62bit_generic);
+
+ anv_device_upload_nir(device, device->internal_cache,
+ nir, sha1);
+
+ device->fp64_nir = nir;
}
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 5194a2f1887..3949a14c3f9 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -30,47 +30,82 @@
#include <pthread.h>
#include <assert.h>
#include <stdint.h>
-#include "drm-uapi/i915_drm.h"
+#include "drm-uapi/drm_fourcc.h"
#ifdef HAVE_VALGRIND
#include <valgrind.h>
#include <memcheck.h>
#define VG(x) x
-#ifndef NDEBUG
-#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
-#endif
#else
#define VG(x) ((void)0)
#endif
-#include "common/intel_clflush.h"
-#include "common/intel_decoder.h"
+#include "common/intel_aux_map.h"
+#include "common/intel_bind_timeline.h"
+#include "common/intel_engine.h"
#include "common/intel_gem.h"
#include "common/intel_l3_config.h"
#include "common/intel_measure.h"
+#include "common/intel_mem.h"
+#include "common/intel_sample_positions.h"
+#include "decoder/intel_decoder.h"
#include "dev/intel_device_info.h"
#include "blorp/blorp.h"
#include "compiler/brw_compiler.h"
+#include "compiler/brw_kernel.h"
#include "compiler/brw_rt.h"
+#include "ds/intel_driver_ds.h"
#include "util/bitset.h"
#include "util/bitscan.h"
+#include "util/detect_os.h"
#include "util/macros.h"
#include "util/hash_table.h"
#include "util/list.h"
+#include "util/perf/u_trace.h"
+#include "util/set.h"
#include "util/sparse_array.h"
#include "util/u_atomic.h"
+#if DETECT_OS_ANDROID
+#include "util/u_gralloc/u_gralloc.h"
+#endif
#include "util/u_vector.h"
#include "util/u_math.h"
#include "util/vma.h"
#include "util/xmlconfig.h"
+#include "vk_acceleration_structure.h"
#include "vk_alloc.h"
+#include "vk_buffer.h"
+#include "vk_buffer_view.h"
+#include "vk_command_buffer.h"
+#include "vk_command_pool.h"
#include "vk_debug_report.h"
+#include "vk_descriptor_update_template.h"
#include "vk_device.h"
+#include "vk_device_memory.h"
+#include "vk_drm_syncobj.h"
+#include "vk_enum_defines.h"
+#include "vk_format.h"
+#include "vk_framebuffer.h"
+#include "vk_graphics_state.h"
#include "vk_image.h"
#include "vk_instance.h"
+#include "vk_pipeline_cache.h"
#include "vk_physical_device.h"
+#include "vk_sampler.h"
#include "vk_shader_module.h"
+#include "vk_sync.h"
+#include "vk_sync_timeline.h"
+#include "vk_texcompress_astc.h"
#include "vk_util.h"
+#include "vk_query_pool.h"
+#include "vk_queue.h"
+#include "vk_log.h"
+#include "vk_ycbcr_conversion.h"
+#include "vk_video.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
@@ -83,7 +118,6 @@ struct anv_batch;
struct anv_buffer;
struct anv_buffer_view;
struct anv_image_view;
-struct anv_acceleration_structure;
struct anv_instance;
struct intel_aux_map_context;
@@ -96,6 +130,8 @@ struct intel_perf_query_result;
#include "anv_android.h"
#include "anv_entrypoints.h"
+#include "anv_kmd_backend.h"
+#include "anv_rmv.h"
#include "isl/isl.h"
#include "dev/intel_debug.h"
@@ -106,59 +142,7 @@ struct intel_perf_query_result;
#define NSEC_PER_SEC 1000000000ull
-/* anv Virtual Memory Layout
- * =========================
- *
- * When the anv driver is determining the virtual graphics addresses of memory
- * objects itself using the softpin mechanism, the following memory ranges
- * will be used.
- *
- * Three special considerations to notice:
- *
- * (1) the dynamic state pool is located within the same 4 GiB as the low
- * heap. This is to work around a VF cache issue described in a comment in
- * anv_physical_device_init_heaps.
- *
- * (2) the binding table pool is located at lower addresses than the surface
- * state pool, within a 4 GiB range. This allows surface state base addresses
- * to cover both binding tables (16 bit offsets) and surface states (32 bit
- * offsets).
- *
- * (3) the last 4 GiB of the address space is withheld from the high
- * heap. Various hardware units will read past the end of an object for
- * various reasons. This healthy margin prevents reads from wrapping around
- * 48-bit addresses.
- */
-#define GENERAL_STATE_POOL_MIN_ADDRESS 0x000000010000ULL /* 64 KiB */
-#define GENERAL_STATE_POOL_MAX_ADDRESS 0x00003fffffffULL
-#define LOW_HEAP_MIN_ADDRESS 0x000040000000ULL /* 1 GiB */
-#define LOW_HEAP_MAX_ADDRESS 0x00007fffffffULL
-#define DYNAMIC_STATE_POOL_MIN_ADDRESS 0x0000c0000000ULL /* 3 GiB */
-#define DYNAMIC_STATE_POOL_MAX_ADDRESS 0x0000ffffffffULL
-#define BINDING_TABLE_POOL_MIN_ADDRESS 0x000100000000ULL /* 4 GiB */
-#define BINDING_TABLE_POOL_MAX_ADDRESS 0x00013fffffffULL
-#define SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB */
-#define SURFACE_STATE_POOL_MAX_ADDRESS 0x00017fffffffULL
-#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */
-#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL
-#define CLIENT_VISIBLE_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */
-#define CLIENT_VISIBLE_HEAP_MAX_ADDRESS 0x0002bfffffffULL
-#define HIGH_HEAP_MIN_ADDRESS 0x0002c0000000ULL /* 11 GiB */
-
-#define GENERAL_STATE_POOL_SIZE \
- (GENERAL_STATE_POOL_MAX_ADDRESS - GENERAL_STATE_POOL_MIN_ADDRESS + 1)
-#define LOW_HEAP_SIZE \
- (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1)
-#define DYNAMIC_STATE_POOL_SIZE \
- (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1)
-#define BINDING_TABLE_POOL_SIZE \
- (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1)
-#define SURFACE_STATE_POOL_SIZE \
- (SURFACE_STATE_POOL_MAX_ADDRESS - SURFACE_STATE_POOL_MIN_ADDRESS + 1)
-#define INSTRUCTION_STATE_POOL_SIZE \
- (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1)
-#define CLIENT_VISIBLE_HEAP_SIZE \
- (CLIENT_VISIBLE_HEAP_MAX_ADDRESS - CLIENT_VISIBLE_HEAP_MIN_ADDRESS + 1)
+#define BINDING_TABLE_POOL_BLOCK_SIZE (65536)
/* Allowing different clear colors requires us to perform a depth resolve at
* the end of certain render passes. This is because while slow clears store
@@ -175,7 +159,16 @@ struct intel_perf_query_result;
*/
#define ANV_HZ_FC_VAL 1.0f
-#define MAX_VBS 28
+/* 3DSTATE_VERTEX_BUFFER supports 33 VBs, we use 2 for base & drawid SGVs */
+#define MAX_VBS (33 - 2)
+
+/* 3DSTATE_VERTEX_ELEMENTS supports up to 34 VEs, but our backend compiler
+ * only supports the push model of VS inputs, and we only have 128 GRFs,
+ * minus the g0 and g1 payload, which gives us a maximum of 31 VEs. Plus,
+ * we use two of them for SGVs.
+ */
+#define MAX_VES (31 - 2)
+
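/* Illustrative arithmetic, not part of this patch: per the comments above,
 *
 *    MAX_VBS == 33 - 2 == 31   (client-visible vertex buffers)
 *    MAX_VES == 31 - 2 == 29   (client-visible vertex elements)
 *
 * with the two reserved slots used for the base/drawid SGVs (see
 * ANV_SVGS_VB_INDEX / ANV_DRAWID_VB_INDEX further down).
 */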
#define MAX_XFB_BUFFERS 4
#define MAX_XFB_STREAMS 4
#define MAX_SETS 8
@@ -184,10 +177,10 @@ struct intel_perf_query_result;
#define MAX_SCISSORS 16
#define MAX_PUSH_CONSTANTS_SIZE 128
#define MAX_DYNAMIC_BUFFERS 16
-#define MAX_IMAGES 64
#define MAX_PUSH_DESCRIPTORS 32 /* Minimum requirement */
#define MAX_INLINE_UNIFORM_BLOCK_SIZE 4096
#define MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS 32
+#define MAX_EMBEDDED_SAMPLERS 2048
/* We need 16 for UBO block reads to work and 32 for push UBOs. However, we
* use 64 here to avoid cache issues. This could most likely bring it back to
* 32 if we had different virtual addresses for the different views on a given
@@ -199,6 +192,11 @@ struct intel_perf_query_result;
#define MAX_VIEWS_FOR_PRIMITIVE_REPLICATION 16
#define MAX_SAMPLE_LOCATIONS 16
+/* RENDER_SURFACE_STATE is a bit smaller (48b) but since it is aligned to 64
+ * and we can't put anything else there we use 64b.
+ */
+#define ANV_SURFACE_STATE_SIZE (64)
+
/* From the Skylake PRM Vol. 7 "Binding Table Surface State Model":
*
* "The surface state model is used when a Binding Table Index (specified
@@ -211,25 +209,6 @@ struct intel_perf_query_result;
*/
#define MAX_BINDING_TABLE_SIZE 240
-/* The kernel relocation API has a limitation of a 32-bit delta value
- * applied to the address before it is written which, in spite of it being
- * unsigned, is treated as signed . Because of the way that this maps to
- * the Vulkan API, we cannot handle an offset into a buffer that does not
- * fit into a signed 32 bits. The only mechanism we have for dealing with
- * this at the moment is to limit all VkDeviceMemory objects to a maximum
- * of 2GB each. The Vulkan spec allows us to do this:
- *
- * "Some platforms may have a limit on the maximum size of a single
- * allocation. For example, certain systems may fail to create
- * allocations with a size greater than or equal to 4GB. Such a limit is
- * implementation-dependent, and if such a failure occurs then the error
- * VK_ERROR_OUT_OF_DEVICE_MEMORY should be returned."
- *
- * We don't use vk_error here because it's not an error so much as an
- * indication to the application that the allocation is too large.
- */
-#define MAX_MEMORY_ALLOCATION_SIZE (1ull << 31)
-
#define ANV_SVGS_VB_INDEX MAX_VBS
#define ANV_DRAWID_VB_INDEX (MAX_VBS + 1)
@@ -244,6 +223,14 @@ struct intel_perf_query_result;
*/
#define ANV_PERF_QUERY_OFFSET_REG 0x2670 /* MI_ALU_REG14 */
+#define ANV_GRAPHICS_SHADER_STAGE_COUNT (MESA_SHADER_MESH + 1)
+
+/* RENDER_SURFACE_STATE is a bit smaller (48b) but since it is aligned to 64
+ * and we can't put anything else there we use 64b.
+ */
+#define ANV_SURFACE_STATE_SIZE (64)
+#define ANV_SAMPLER_STATE_SIZE (32)
+
/* For gfx12 we set the streamout buffers using 4 separate commands
* (3DSTATE_SO_BUFFER_INDEX_*) instead of 3DSTATE_SO_BUFFER. However the layout
* of the 3DSTATE_SO_BUFFER_INDEX_* commands is identical to that of
@@ -255,46 +242,28 @@ struct intel_perf_query_result;
#define SO_BUFFER_INDEX_0_CMD 0x60
#define anv_printflike(a, b) __attribute__((__format__(__printf__, a, b)))
+/* The TR-TT L1 page table entries may contain these values instead of actual
+ * pointers to indicate the regions are either NULL or invalid. We program
+ * these values to TR-TT registers, so we could change them, but it's super
+ * convenient to have the NULL value be 0 because everything is
+ * zero-initialized when allocated.
+ *
+ * Since we reserve these values for NULL/INVALID, then we can't use them as
+ * destinations for TR-TT address translation. Both values are shifted by 16
+ * bits, which results in graphics addresses 0 and 64k. In anv the first vma
+ * starts at 2MB, so we already don't use 0 and 64k for anything and there's
+ * nothing really to reserve. We could instead just reserve random 64kb
+ * ranges from any of the non-TR-TT vmas and use their addresses.
+ */
+#define ANV_TRTT_L1_NULL_TILE_VAL 0
+#define ANV_TRTT_L1_INVALID_TILE_VAL 1
+
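/* Illustrative arithmetic, not part of this patch: after the 16-bit shift
 * mentioned above, the two reserved values map to GPU addresses
 *
 *    ANV_TRTT_L1_NULL_TILE_VAL    << 16 == 0x00000  (0)
 *    ANV_TRTT_L1_INVALID_TILE_VAL << 16 == 0x10000  (64 KiB)
 *
 * both of which sit below the first anv vma at 2 MiB, so no real mapping can
 * collide with them.
 */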
static inline uint32_t
align_down_npot_u32(uint32_t v, uint32_t a)
{
return v - (v % a);
}
-static inline uint32_t
-align_down_u32(uint32_t v, uint32_t a)
-{
- assert(a != 0 && a == (a & -a));
- return v & ~(a - 1);
-}
-
-static inline uint32_t
-align_u32(uint32_t v, uint32_t a)
-{
- assert(a != 0 && a == (a & -a));
- return align_down_u32(v + a - 1, a);
-}
-
-static inline uint64_t
-align_down_u64(uint64_t v, uint64_t a)
-{
- assert(a != 0 && a == (a & -a));
- return v & ~(a - 1);
-}
-
-static inline uint64_t
-align_u64(uint64_t v, uint64_t a)
-{
- return align_down_u64(v + a - 1, a);
-}
-
-static inline int32_t
-align_i32(int32_t v, int32_t a)
-{
- assert(a != 0 && a == (a & -a));
- return (v + a - 1) & ~(a - 1);
-}
-
/** Alignment must be a power of 2. */
static inline bool
anv_is_aligned(uintmax_t n, uintmax_t a)
@@ -303,39 +272,6 @@ anv_is_aligned(uintmax_t n, uintmax_t a)
return (n & (a - 1)) == 0;
}
-static inline uint32_t
-anv_minify(uint32_t n, uint32_t levels)
-{
- if (unlikely(n == 0))
- return 0;
- else
- return MAX2(n >> levels, 1);
-}
-
-static inline float
-anv_clamp_f(float f, float min, float max)
-{
- assert(min < max);
-
- if (f > max)
- return max;
- else if (f < min)
- return min;
- else
- return f;
-}
-
-static inline bool
-anv_clear_mask(uint32_t *inout_mask, uint32_t clear_mask)
-{
- if (*inout_mask & clear_mask) {
- *inout_mask &= ~clear_mask;
- return true;
- } else {
- return false;
- }
-}
-
static inline union isl_color_value
vk_to_isl_color(VkClearColorValue color)
{
@@ -349,55 +285,26 @@ vk_to_isl_color(VkClearColorValue color)
};
}
-static inline void *anv_unpack_ptr(uintptr_t ptr, int bits, int *flags)
-{
- uintptr_t mask = (1ull << bits) - 1;
- *flags = ptr & mask;
- return (void *) (ptr & ~mask);
-}
-
-static inline uintptr_t anv_pack_ptr(void *ptr, int bits, int flags)
+static inline union isl_color_value
+vk_to_isl_color_with_format(VkClearColorValue color, enum isl_format format)
{
- uintptr_t value = (uintptr_t) ptr;
- uintptr_t mask = (1ull << bits) - 1;
- return value | (mask & flags);
-}
+ const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+ union isl_color_value isl_color = { .u32 = {0, } };
-/* Whenever we generate an error, pass it through this function. Useful for
- * debugging, where we can break on it. Only call at error site, not when
- * propagating errors. Might be useful to plug in a stack trace here.
- */
+#define COPY_COLOR_CHANNEL(c, i) \
+ if (fmtl->channels.c.bits) \
+ isl_color.u32[i] = color.uint32[i]
-VkResult __vk_errorv(struct anv_instance *instance,
- const struct vk_object_base *object, VkResult error,
- const char *file, int line, const char *format,
- va_list args);
+ COPY_COLOR_CHANNEL(r, 0);
+ COPY_COLOR_CHANNEL(g, 1);
+ COPY_COLOR_CHANNEL(b, 2);
+ COPY_COLOR_CHANNEL(a, 3);
-VkResult __vk_errorf(struct anv_instance *instance,
- const struct vk_object_base *object, VkResult error,
- const char *file, int line, const char *format, ...)
- anv_printflike(6, 7);
-
-#ifdef DEBUG
-#define vk_error(error) __vk_errorf(NULL, NULL, error, __FILE__, __LINE__, NULL)
-#define vk_errorfi(instance, obj, error, format, ...)\
- __vk_errorf(instance, obj, error,\
- __FILE__, __LINE__, format, ## __VA_ARGS__)
-#define vk_errorf(device, obj, error, format, ...)\
- vk_errorfi(anv_device_instance_or_null(device),\
- obj, error, format, ## __VA_ARGS__)
-#else
+#undef COPY_COLOR_CHANNEL
-static inline VkResult __dummy_vk_error(VkResult error, UNUSED const void *ignored)
-{
- return error;
+ return isl_color;
}
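/* Illustrative sketch, not part of this patch: with a format that has no
 * alpha channel, the helper above leaves the alpha word zeroed instead of
 * copying the client value. ISL_FORMAT_R8G8B8_UNORM is just a convenient
 * example format.
 */
static union isl_color_value
example_clear_color(void)
{
   VkClearColorValue clear = { .uint32 = { 1, 2, 3, 4 } };
   /* Returns { 1, 2, 3, 0 }: the format has R/G/B bits but no alpha bits. */
   return vk_to_isl_color_with_format(clear, ISL_FORMAT_R8G8B8_UNORM);
}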
-#define vk_error(error) __dummy_vk_error(error, NULL)
-#define vk_errorfi(instance, obj, error, format, ...) __dummy_vk_error(error, instance)
-#define vk_errorf(device, obj, error, format, ...) __dummy_vk_error(error, device)
-#endif
-
/**
* Warn on ignored extension structs.
*
@@ -420,8 +327,6 @@ void __anv_perf_warn(struct anv_device *device,
const struct vk_object_base *object,
const char *file, int line, const char *format, ...)
anv_printflike(5, 6);
-void anv_loge(const char *format, ...) anv_printflike(1, 2);
-void anv_loge_v(const char *format, va_list va);
/**
* Print a FINISHME message, including its source location.
@@ -439,18 +344,20 @@ void anv_loge_v(const char *format, va_list va);
/**
* Print a perf warning message. Set INTEL_DEBUG=perf to see these.
*/
-#define anv_perf_warn(instance, obj, format, ...) \
+#define anv_perf_warn(objects_macro, format, ...) \
do { \
static bool reported = false; \
- if (!reported && (INTEL_DEBUG & DEBUG_PERF)) { \
- __anv_perf_warn(instance, obj, __FILE__, __LINE__,\
- format, ##__VA_ARGS__); \
+ if (!reported && INTEL_DEBUG(DEBUG_PERF)) { \
+ __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT, \
+ VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, \
+ objects_macro, __FILE__, __LINE__, \
+ format, ## __VA_ARGS__); \
reported = true; \
} \
} while (0)
/* A non-fatal assert. Useful for debugging. */
-#ifdef DEBUG
+#if MESA_DEBUG
#define anv_assert(x) ({ \
if (unlikely(!(x))) \
mesa_loge("%s:%d ASSERT: %s", __FILE__, __LINE__, #x); \
@@ -459,89 +366,179 @@ void anv_loge_v(const char *format, va_list va);
#define anv_assert(x)
#endif
+enum anv_bo_alloc_flags {
+ /** Specifies that the BO must have a 32-bit address
+ *
+ * This is the opposite of EXEC_OBJECT_SUPPORTS_48B_ADDRESS.
+ */
+ ANV_BO_ALLOC_32BIT_ADDRESS = (1 << 0),
+
+ /** Specifies that the BO may be shared externally */
+ ANV_BO_ALLOC_EXTERNAL = (1 << 1),
+
+ /** Specifies that the BO should be mapped */
+ ANV_BO_ALLOC_MAPPED = (1 << 2),
+
+ /** Specifies that the BO should be coherent.
+ *
+    * Note: On platforms with LLC, where HOST_CACHED + HOST_COHERENT is free,
+    * the BO can get upgraded to HOST_CACHED_COHERENT
+ */
+ ANV_BO_ALLOC_HOST_COHERENT = (1 << 3),
+
+ /** Specifies that the BO should be captured in error states */
+ ANV_BO_ALLOC_CAPTURE = (1 << 4),
+
+ /** Specifies that the BO will have an address assigned by the caller
+ *
+ * Such BOs do not exist in any VMA heap.
+ */
+ ANV_BO_ALLOC_FIXED_ADDRESS = (1 << 5),
+
+ /** Enables implicit synchronization on the BO
+ *
+ * This is the opposite of EXEC_OBJECT_ASYNC.
+ */
+ ANV_BO_ALLOC_IMPLICIT_SYNC = (1 << 6),
+
+ /** Enables implicit synchronization on the BO
+ *
+ * This is equivalent to EXEC_OBJECT_WRITE.
+ */
+ ANV_BO_ALLOC_IMPLICIT_WRITE = (1 << 7),
+
+ /** Has an address which is visible to the client */
+ ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8),
+
+ /** Align the BO's virtual address to match AUX-TT requirements */
+ ANV_BO_ALLOC_AUX_TT_ALIGNED = (1 << 9),
+
+   /** This buffer is allocated from local memory and should be CPU visible */
+ ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE = (1 << 10),
+
+ /** For non device local allocations */
+ ANV_BO_ALLOC_NO_LOCAL_MEM = (1 << 11),
+
+   /** This buffer will be scanned out to a display */
+ ANV_BO_ALLOC_SCANOUT = (1 << 12),
+
+ /** For descriptor pools */
+ ANV_BO_ALLOC_DESCRIPTOR_POOL = (1 << 13),
+
+ /** For buffers that will be bound using TR-TT.
+ *
+ * Not for buffers used as the TR-TT page tables.
+ */
+ ANV_BO_ALLOC_TRTT = (1 << 14),
+
+ /** Protected buffer */
+ ANV_BO_ALLOC_PROTECTED = (1 << 15),
+
+ /** Specifies that the BO should be cached and incoherent. */
+ ANV_BO_ALLOC_HOST_CACHED = (1 << 16),
+
+ /** For sampler pools */
+ ANV_BO_ALLOC_SAMPLER_POOL = (1 << 17),
+
+ /** Specifies that the BO is imported.
+ *
+ * Imported BOs must also be marked as ANV_BO_ALLOC_EXTERNAL
+ */
+ ANV_BO_ALLOC_IMPORTED = (1 << 18),
+
+ /** Specify whether this BO is internal to the driver */
+ ANV_BO_ALLOC_INTERNAL = (1 << 19),
+
+ /** Allocate with CCS AUX requirements
+ *
+    * This pads the BO to include CCS data mappable through the AUX-TT and
+ * aligned to the AUX-TT requirements.
+ */
+ ANV_BO_ALLOC_AUX_CCS = (1 << 20),
+
+ /** For descriptor buffer pools */
+ ANV_BO_ALLOC_DESCRIPTOR_BUFFER_POOL = (1 << 21),
+};
+
+/** Specifies that the BO should be cached and coherent. */
+#define ANV_BO_ALLOC_HOST_CACHED_COHERENT (ANV_BO_ALLOC_HOST_COHERENT | \
+ ANV_BO_ALLOC_HOST_CACHED)
+
+
struct anv_bo {
const char *name;
+ /* The VMA heap in anv_device from which this BO takes its offset.
+ *
+ * This can only be NULL when has_fixed_address is true.
+ */
+ struct util_vma_heap *vma_heap;
+
+   /* All userptr BOs in the Xe KMD have gem_handle set to workaround_bo->gem_handle */
uint32_t gem_handle;
uint32_t refcount;
/* Index into the current validation list. This is used by the
- * validation list building alrogithm to track which buffers are already
+ * validation list building algorithm to track which buffers are already
* in the validation list so that we can ensure uniqueness.
*/
- uint32_t index;
+ uint32_t exec_obj_index;
/* Index for use with util_sparse_array_free_list */
uint32_t free_index;
/* Last known offset. This value is provided by the kernel when we
* execbuf and is used as the presumed offset for the next bunch of
- * relocations.
+ * relocations, in canonical address format.
*/
uint64_t offset;
- /** Size of the buffer not including implicit aux */
+ /** Size of the buffer */
uint64_t size;
+ /** Offset at which the CCS data is stored */
+ uint64_t ccs_offset;
+
/* Map for internally mapped BOs.
*
- * If ANV_BO_WRAPPER is set in flags, map points to the wrapped BO.
+ * If ANV_BO_ALLOC_MAPPED is set in flags, this is the map for the whole
+ * BO.
*/
void *map;
- /** Size of the implicit CCS range at the end of the buffer
- *
- * On Gfx12, CCS data is always a direct 1/256 scale-down. A single 64K
- * page of main surface data maps to a 256B chunk of CCS data and that
- * mapping is provided on TGL-LP by the AUX table which maps virtual memory
- * addresses in the main surface to virtual memory addresses for CCS data.
- *
- * Because we can't change these maps around easily and because Vulkan
- * allows two VkImages to be bound to overlapping memory regions (as long
- * as the app is careful), it's not feasible to make this mapping part of
- * the image. (On Gfx11 and earlier, the mapping was provided via
- * RENDER_SURFACE_STATE so each image had its own main -> CCS mapping.)
- * Instead, we attach the CCS data directly to the buffer object and setup
- * the AUX table mapping at BO creation time.
- *
- * This field is for internal tracking use by the BO allocator only and
- * should not be touched by other parts of the code. If something wants to
- * know if a BO has implicit CCS data, it should instead look at the
- * has_implicit_ccs boolean below.
- *
- * This data is not included in maps of this buffer.
+   /* The actual size of the BO allocated by the KMD, basically:
+ * align(size, mem_alignment)
*/
- uint32_t _ccs_size;
+ uint64_t actual_size;
/** Flags to pass to the kernel through drm_i915_exec_object2::flags */
uint32_t flags;
- /** True if this BO may be shared with other processes */
- bool is_external:1;
-
- /** True if this BO is a wrapper
- *
- * When set to true, none of the fields in this BO are meaningful except
- * for anv_bo::is_wrapper and anv_bo::map which points to the actual BO.
- * See also anv_bo_unwrap(). Wrapper BOs are not allowed when use_softpin
- * is set in the physical device.
- */
- bool is_wrapper:1;
-
- /** See also ANV_BO_ALLOC_FIXED_ADDRESS */
- bool has_fixed_address:1;
+ enum anv_bo_alloc_flags alloc_flags;
/** True if this BO wraps a host pointer */
bool from_host_ptr:1;
- /** See also ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS */
- bool has_client_visible_address:1;
-
- /** True if this BO has implicit CCS data attached to it */
- bool has_implicit_ccs:1;
+ /** True if this BO is mapped in the GTT (only used for RMV) */
+ bool gtt_mapped:1;
};
+static inline bool
+anv_bo_is_external(const struct anv_bo *bo)
+{
+ return bo->alloc_flags & ANV_BO_ALLOC_EXTERNAL;
+}
+
+static inline bool
+anv_bo_is_vram_only(const struct anv_bo *bo)
+{
+ return !(bo->alloc_flags & (ANV_BO_ALLOC_NO_LOCAL_MEM |
+ ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE |
+ ANV_BO_ALLOC_IMPORTED));
+}
+
static inline struct anv_bo *
anv_bo_ref(struct anv_bo *bo)
{
@@ -549,14 +546,71 @@ anv_bo_ref(struct anv_bo *bo)
return bo;
}
-static inline struct anv_bo *
-anv_bo_unwrap(struct anv_bo *bo)
+enum intel_device_info_mmap_mode
+anv_bo_get_mmap_mode(struct anv_device *device, struct anv_bo *bo);
+
+static inline bool
+anv_bo_needs_host_cache_flush(enum anv_bo_alloc_flags alloc_flags)
{
- while (bo->is_wrapper)
- bo = bo->map;
- return bo;
+ return (alloc_flags & (ANV_BO_ALLOC_HOST_CACHED | ANV_BO_ALLOC_HOST_COHERENT)) ==
+ ANV_BO_ALLOC_HOST_CACHED;
+}
+
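/* Illustrative combinations, not part of this patch: with the helper above,
 * only the cached-but-not-coherent case needs manual flushes:
 *
 *    anv_bo_needs_host_cache_flush(ANV_BO_ALLOC_HOST_CACHED)          == true
 *    anv_bo_needs_host_cache_flush(ANV_BO_ALLOC_HOST_CACHED_COHERENT) == false
 *    anv_bo_needs_host_cache_flush(ANV_BO_ALLOC_HOST_COHERENT)        == false
 *    anv_bo_needs_host_cache_flush(0)                                 == false
 */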
+struct anv_address {
+ struct anv_bo *bo;
+ int64_t offset;
+};
+
+#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 })
+
+static inline struct anv_address
+anv_address_from_u64(uint64_t addr_u64)
+{
+ assert(addr_u64 == intel_canonical_address(addr_u64));
+ return (struct anv_address) {
+ .bo = NULL,
+ .offset = addr_u64,
+ };
+}
+
+static inline bool
+anv_address_is_null(struct anv_address addr)
+{
+ return addr.bo == NULL && addr.offset == 0;
}
+static inline uint64_t
+anv_address_physical(struct anv_address addr)
+{
+ uint64_t address = (addr.bo ? addr.bo->offset : 0ull) + addr.offset;
+ return intel_canonical_address(address);
+}
+
+static inline struct anv_address
+anv_address_add(struct anv_address addr, uint64_t offset)
+{
+ addr.offset += offset;
+ return addr;
+}
+
+static inline void *
+anv_address_map(struct anv_address addr)
+{
+ if (addr.bo == NULL)
+ return NULL;
+
+ if (addr.bo->map == NULL)
+ return NULL;
+
+ return addr.bo->map + addr.offset;
+}
+
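/* Illustrative sketch, not part of this patch: composing an address into a
 * BO and reading back both its GPU VA and CPU mapping with the helpers above.
 * The function name and the offset of 64 are arbitrary for illustration.
 */
static void
example_address_use(struct anv_bo *bo)
{
   struct anv_address addr = { .bo = bo, .offset = 0 };
   addr = anv_address_add(addr, 64);

   uint64_t gpu_va = anv_address_physical(addr); /* bo->offset + 64, canonical */
   void *cpu_ptr = anv_address_map(addr);        /* NULL if the BO is unmapped */
   (void)gpu_va;
   (void)cpu_ptr;
}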
+/* Represent a virtual address range */
+struct anv_va_range {
+ uint64_t addr;
+ uint64_t size;
+};
+
/* Represents a lock-free linked list of "free" things. This is used by
* both the block pool and the state pools. Unfortunately, in order to
* solve the ABA problem, we can't use a single uint32_t head.
@@ -571,7 +625,7 @@ union anv_free_list {
/* Make sure it's aligned to 64 bits. This will make atomic operations
* faster on 32 bit platforms.
*/
- uint64_t u64 __attribute__ ((aligned (8)));
+ alignas(8) uint64_t u64;
};
#define ANV_FREE_LIST_EMPTY ((union anv_free_list) { { UINT32_MAX, 0 } })
@@ -585,7 +639,7 @@ struct anv_block_state {
/* Make sure it's aligned to 64 bits. This will make atomic operations
* faster on 32 bit platforms.
*/
- uint64_t u64 __attribute__ ((aligned (8)));
+ alignas(8) uint64_t u64;
};
};
@@ -600,22 +654,18 @@ struct anv_block_pool {
const char *name;
struct anv_device *device;
- bool use_softpin;
-
- /* Wrapper BO for use in relocation lists. This BO is simply a wrapper
- * around the actual BO so that we grow the pool after the wrapper BO has
- * been put in a relocation list. This is only used in the non-softpin
- * case.
- */
- struct anv_bo wrapper_bo;
struct anv_bo *bos[ANV_MAX_BLOCK_POOL_BOS];
struct anv_bo *bo;
uint32_t nbos;
+ /* Maximum size of the pool */
+ uint64_t max_size;
+
+ /* Current size of the pool */
uint64_t size;
- /* The address where the start of the pool is pinned. The various bos that
+ /* The canonical address where the start of the pool is pinned. The various bos that
* are created as the pool grows will have addresses in the range
* [start_address, start_address + BLOCK_POOL_MEMFD_SIZE).
*/
@@ -627,30 +677,9 @@ struct anv_block_pool {
*/
uint32_t center_bo_offset;
- /* Current memory map of the block pool. This pointer may or may not
- * point to the actual beginning of the block pool memory. If
- * anv_block_pool_alloc_back has ever been called, then this pointer
- * will point to the "center" position of the buffer and all offsets
- * (negative or positive) given out by the block pool alloc functions
- * will be valid relative to this pointer.
- *
- * In particular, map == bo.map + center_offset
- *
- * DO NOT access this pointer directly. Use anv_block_pool_map() instead,
- * since it will handle the softpin case as well, where this points to NULL.
- */
- void *map;
- int fd;
-
- /**
- * Array of mmaps and gem handles owned by the block pool, reclaimed when
- * the block pool is destroyed.
- */
- struct u_vector mmap_cleanups;
-
struct anv_block_state state;
- struct anv_block_state back_state;
+ enum anv_bo_alloc_flags bo_alloc_flags;
};
/* Block pools are backed by a fixed-size 1GB memfd */
@@ -664,14 +693,14 @@ struct anv_block_pool {
static inline uint32_t
anv_block_pool_size(struct anv_block_pool *pool)
{
- return pool->state.end + pool->back_state.end;
+ return pool->state.end;
}
struct anv_state {
- int32_t offset;
+ int64_t offset;
uint32_t alloc_size;
- void *map;
uint32_t idx;
+ void *map;
};
#define ANV_STATE_NULL ((struct anv_state) { .alloc_size = 0 })
@@ -682,7 +711,7 @@ struct anv_fixed_size_state_pool {
};
#define ANV_MIN_STATE_SIZE_LOG2 6
-#define ANV_MAX_STATE_SIZE_LOG2 21
+#define ANV_MAX_STATE_SIZE_LOG2 22
#define ANV_STATE_BUCKETS (ANV_MAX_STATE_SIZE_LOG2 - ANV_MIN_STATE_SIZE_LOG2 + 1)
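/* Illustrative arithmetic, not part of this patch: with the limits above the
 * fixed-size buckets cover 2^6 = 64 B up to 2^22 = 4 MiB, i.e.
 * ANV_STATE_BUCKETS == 22 - 6 + 1 == 17 power-of-two size classes.
 */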
@@ -696,6 +725,7 @@ struct anv_state_table {
int fd;
struct anv_free_entry *map;
uint32_t size;
+ uint64_t max_size;
struct anv_block_state state;
struct u_vector cleanups;
};
@@ -706,16 +736,13 @@ struct anv_state_pool {
/* Offset into the relevant state base address where the state pool starts
* allocating memory.
*/
- int32_t start_offset;
+ int64_t start_offset;
struct anv_state_table table;
/* The size of blocks which will be allocated from the block pool */
uint32_t block_size;
- /** Free list for "back" allocations */
- union anv_free_list back_alloc_free_list;
-
struct anv_fixed_size_state_pool buckets[ANV_STATE_BUCKETS];
};
@@ -725,6 +752,21 @@ struct anv_state_reserved_pool {
uint32_t count;
};
+struct anv_state_reserved_array_pool {
+ struct anv_state_pool *pool;
+ simple_mtx_t mutex;
+ /* Bitfield of usable elements */
+ BITSET_WORD *states;
+ /* Backing store */
+ struct anv_state state;
+ /* Number of elements */
+ uint32_t count;
+ /* Stride between each element */
+ uint32_t stride;
+ /* Size of each element */
+ uint32_t size;
+};
+
struct anv_state_stream {
struct anv_state_pool *state_pool;
@@ -737,10 +779,42 @@ struct anv_state_stream {
/* Offset into the current block at which to allocate the next state */
uint32_t next;
+ /* Sum of all the blocks in all_blocks */
+ uint32_t total_size;
+
/* List of all blocks allocated from this pool */
struct util_dynarray all_blocks;
};
+struct anv_sparse_submission {
+ struct anv_queue *queue;
+
+ struct anv_vm_bind *binds;
+ int binds_len;
+ int binds_capacity;
+
+ uint32_t wait_count;
+ uint32_t signal_count;
+
+ struct vk_sync_wait *waits;
+ struct vk_sync_signal *signals;
+};
+
+struct anv_trtt_bind {
+ uint64_t pte_addr;
+ uint64_t entry_addr;
+};
+
+struct anv_trtt_submission {
+ struct anv_sparse_submission *sparse;
+
+ struct anv_trtt_bind *l3l2_binds;
+ struct anv_trtt_bind *l1_binds;
+
+ int l3l2_binds_len;
+ int l1_binds_len;
+};
+
/* The block_pool functions exported for testing only. The block pool should
* only be used via a state pool (see below).
*/
@@ -748,26 +822,54 @@ VkResult anv_block_pool_init(struct anv_block_pool *pool,
struct anv_device *device,
const char *name,
uint64_t start_address,
- uint32_t initial_size);
+ uint32_t initial_size,
+ uint32_t max_size);
void anv_block_pool_finish(struct anv_block_pool *pool);
-int32_t anv_block_pool_alloc(struct anv_block_pool *pool,
- uint32_t block_size, uint32_t *padding);
-int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool,
- uint32_t block_size);
+VkResult anv_block_pool_alloc(struct anv_block_pool *pool,
+ uint32_t block_size,
+ int64_t *offset,
+ uint32_t *padding);
void* anv_block_pool_map(struct anv_block_pool *pool, int32_t offset, uint32_t
size);
+struct anv_state_pool_params {
+ const char *name;
+ uint64_t base_address;
+ int64_t start_offset;
+ uint32_t block_size;
+ uint32_t max_size;
+};
+
VkResult anv_state_pool_init(struct anv_state_pool *pool,
struct anv_device *device,
- const char *name,
- uint64_t base_address,
- int32_t start_offset,
- uint32_t block_size);
+ const struct anv_state_pool_params *params);
void anv_state_pool_finish(struct anv_state_pool *pool);
struct anv_state anv_state_pool_alloc(struct anv_state_pool *pool,
uint32_t state_size, uint32_t alignment);
-struct anv_state anv_state_pool_alloc_back(struct anv_state_pool *pool);
void anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state);
+
+static inline struct anv_address
+anv_state_pool_state_address(struct anv_state_pool *pool, struct anv_state state)
+{
+ return (struct anv_address) {
+ .bo = pool->block_pool.bo,
+ .offset = state.offset - pool->start_offset,
+ };
+}
+
+static inline struct anv_state
+anv_state_pool_emit_data(struct anv_state_pool *pool,
+ size_t size, size_t align,
+ const void *p)
+{
+ struct anv_state state;
+
+ state = anv_state_pool_alloc(pool, size, align);
+ memcpy(state.map, p, size);
+
+ return state;
+}
+
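A minimal usage sketch (not part of this patch) combining the two helpers above; the function name is invented and it assumes the caller owns a suitable state pool.

static struct anv_address
upload_constants_example(struct anv_state_pool *pool, const float color[4])
{
   /* Allocate from the pool and copy the host data into the mapped state. */
   struct anv_state state =
      anv_state_pool_emit_data(pool, 4 * sizeof(float), 16, color);
   /* Translate the pool-relative offset into a GPU-visible address. */
   return anv_state_pool_state_address(pool, state);
}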
void anv_state_stream_init(struct anv_state_stream *stream,
struct anv_state_pool *state_pool,
uint32_t block_size);
@@ -784,6 +886,20 @@ struct anv_state anv_state_reserved_pool_alloc(struct anv_state_reserved_pool *p
void anv_state_reserved_pool_free(struct anv_state_reserved_pool *pool,
struct anv_state state);
+VkResult anv_state_reserved_array_pool_init(struct anv_state_reserved_array_pool *pool,
+ struct anv_state_pool *parent,
+ uint32_t count, uint32_t size,
+ uint32_t alignment);
+void anv_state_reserved_array_pool_finish(struct anv_state_reserved_array_pool *pool);
+struct anv_state anv_state_reserved_array_pool_alloc(struct anv_state_reserved_array_pool *pool,
+ bool alloc_back);
+struct anv_state anv_state_reserved_array_pool_alloc_index(struct anv_state_reserved_array_pool *pool,
+ unsigned idx);
+uint32_t anv_state_reserved_array_pool_state_index(struct anv_state_reserved_array_pool *pool,
+ struct anv_state state);
+void anv_state_reserved_array_pool_free(struct anv_state_reserved_array_pool *pool,
+ struct anv_state state);
+
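For illustration only (not part of this patch), the new reserved-array-pool API is expected to be driven roughly as follows; the count, size and alignment are placeholders and the wrapper function is invented.

static void
reserved_array_pool_example(struct anv_device *device)
{
   struct anv_state_reserved_array_pool pool;
   if (anv_state_reserved_array_pool_init(&pool, &device->dynamic_state_pool,
                                          1024 /* count */, 64 /* size */,
                                          64 /* alignment */) != VK_SUCCESS)
      return;

   struct anv_state state =
      anv_state_reserved_array_pool_alloc(&pool, false /* alloc_back */);
   /* The index is stable for the lifetime of the pool and can be stored. */
   uint32_t idx = anv_state_reserved_array_pool_state_index(&pool, state);
   (void)idx;

   anv_state_reserved_array_pool_free(&pool, state);
   anv_state_reserved_array_pool_finish(&pool);
}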
VkResult anv_state_table_init(struct anv_state_table *table,
struct anv_device *device,
uint32_t initial_entries);
@@ -811,11 +927,13 @@ struct anv_bo_pool {
struct anv_device *device;
+ enum anv_bo_alloc_flags bo_alloc_flags;
+
struct util_sparse_array_free_list free_list[16];
};
void anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device,
- const char *name);
+ const char *name, enum anv_bo_alloc_flags alloc_flags);
void anv_bo_pool_finish(struct anv_bo_pool *pool);
VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, uint32_t size,
struct anv_bo **bo_out);
@@ -846,7 +964,8 @@ struct anv_bo_cache {
pthread_mutex_t mutex;
};
-VkResult anv_bo_cache_init(struct anv_bo_cache *cache);
+VkResult anv_bo_cache_init(struct anv_bo_cache *cache,
+ struct anv_device *device);
void anv_bo_cache_finish(struct anv_bo_cache *cache);
struct anv_queue_family {
@@ -854,16 +973,17 @@ struct anv_queue_family {
VkQueueFlags queueFlags;
uint32_t queueCount;
- /* Driver internal information */
- enum drm_i915_gem_engine_class engine_class;
+ enum intel_engine_class engine_class;
};
-#define ANV_MAX_QUEUE_FAMILIES 3
+#define ANV_MAX_QUEUE_FAMILIES 5
struct anv_memory_type {
/* Standard bits passed on to the client */
VkMemoryPropertyFlags propertyFlags;
uint32_t heapIndex;
+ /* Whether this is the descriptor buffer memory type */
+ bool descriptor_buffer;
};
struct anv_memory_heap {
@@ -875,17 +995,25 @@ struct anv_memory_heap {
*
* Align it to 64 bits to make atomic operations faster on 32 bit platforms.
*/
- VkDeviceSize used __attribute__ ((aligned (8)));
+ alignas(8) VkDeviceSize used;
bool is_local_mem;
};
struct anv_memregion {
- struct drm_i915_gem_memory_class_instance region;
+ const struct intel_memory_class_instance *region;
uint64_t size;
uint64_t available;
};
+enum anv_timestamp_capture_type {
+ ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE,
+ ANV_TIMESTAMP_CAPTURE_END_OF_PIPE,
+ ANV_TIMESTAMP_CAPTURE_AT_CS_STALL,
+ ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER,
+ ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH,
+};
+
struct anv_physical_device {
struct vk_physical_device vk;
@@ -894,22 +1022,10 @@ struct anv_physical_device {
struct anv_instance * instance;
char path[20];
- struct {
- uint16_t domain;
- uint8_t bus;
- uint8_t device;
- uint8_t function;
- } pci_info;
struct intel_device_info info;
- /** Amount of "GPU memory" we want to advertise
- *
- * Clearly, this value is bogus since Intel is a UMA architecture. On
- * gfx7 platforms, we are limited by GTT size unless we want to implement
- * fine-grained tracking and GTT splitting. On Broadwell and above we are
- * practically unlimited. However, we will never report more than 3/4 of
- * the total system ram to try and avoid running out of RAM.
- */
- bool supports_48bit_addresses;
+
+ bool video_decode_enabled;
+
struct brw_compiler * compiler;
struct isl_device isl_dev;
struct intel_perf_config * perf;
@@ -918,29 +1034,14 @@ struct anv_physical_device {
* end.
*/
uint32_t n_perf_query_commands;
- int cmd_parser_version;
bool has_exec_async;
bool has_exec_capture;
- bool has_exec_fence;
- bool has_syncobj_wait;
- bool has_syncobj_wait_available;
- bool has_context_priority;
- bool has_context_isolation;
- bool has_thread_submit;
- bool has_mmap_offset;
- bool has_userptr_probe;
+ VkQueueGlobalPriorityKHR max_context_priority;
uint64_t gtt_size;
- bool use_softpin;
bool always_use_bindless;
bool use_call_secondary;
- /** True if we can access buffers using A64 messages */
- bool has_a64_buffer_access;
- /** True if we can use bindless access for images */
- bool has_bindless_images;
- /** True if we can use bindless access for samplers */
- bool has_bindless_samplers;
/** True if we can use timeline semaphores through execbuf */
bool has_exec_timeline;
@@ -951,15 +1052,78 @@ struct anv_physical_device {
*/
bool has_reg_timestamp;
- /** True if this device has implicit AUX
- *
- * If true, CCS is handled as an implicit attachment to the BO rather than
- * as an explicitly bound surface.
+ /** True if we can create protected contexts. */
+ bool has_protected_contexts;
+
+ /** Whether KMD has the ability to create VM objects */
+ bool has_vm_control;
+
+ /** True if we have the means to do sparse binding (e.g., a kernel driver
+ * with a vm_bind ioctl).
*/
- bool has_implicit_ccs;
+ enum anv_sparse_type {
+ ANV_SPARSE_TYPE_NOT_SUPPORTED = 0,
+ ANV_SPARSE_TYPE_VM_BIND,
+ ANV_SPARSE_TYPE_TRTT,
+ ANV_SPARSE_TYPE_FAKE,
+ } sparse_type;
+
+ /** True if HW supports ASTC LDR */
+ bool has_astc_ldr;
+ /** True if denorms in void extents should be flushed to zero */
+ bool flush_astc_ldr_void_extent_denorms;
+ /** True if ASTC LDR is supported via emulation */
+ bool emu_astc_ldr;
+ /* True if FCV optimization should be disabled. */
+ bool disable_fcv;
+ /**/
+ bool uses_ex_bso;
bool always_flush_cache;
+ /** True if application memory is allocated with extra AUX memory
+ *
+ * Applications quite often pool image allocations together in a single
+ * VkDeviceMemory object. On platforms like MTL, the alignment of images
+ * with compression mapped through the AUX translation tables is large:
+ * 1MB. This can create a lot of wasted space in the application memory
+ * objects.
+ *
+ * To work around this problem, we allocate CCS data at the end of
+ * VkDeviceMemory objects. This would not work well for TGL-like platforms
+ * because the AUX translation tables also contain the format of the
+ * images, but on MTL the HW ignores those values. So we can share the AUX
+ * TT entries between different images without problem.
+ *
+ * This should be only true for platforms with AUX TT.
+ */
+ bool alloc_aux_tt_mem;
+
+ /**
+ * True if the descriptor buffers hold one of the following:
+ * - anv_sampled_image_descriptor
+ * - anv_storage_image_descriptor
+ * - anv_address_range_descriptor
+ *
+ * Accessing the descriptors in a bindless fashion from the shader
+ * requires an indirection in the shader: first fetch one of the structures
+ * listed above from the descriptor buffer, then emit the send message to
+ * the fixed function (sampler, dataport, etc.) with the handle fetched
+ * above.
+ *
+ * We need to do things this way prior to DG2 because the bindless surface
+ * state space is limited to 64MB and some applications will allocate more
+ * than what the HW can support. On DG2+ we get 4GB of bindless surface state
+ * and so we can directly reference RENDER_SURFACE_STATE/SAMPLER_STATE
+ * structures instead.
+ */
+ bool indirect_descriptors;
+
+ bool uses_relocs;
+
+ /** Can the platform support cooperative matrices and is it enabled? */
+ bool has_cooperative_matrix;
+
struct {
uint32_t family_count;
struct anv_queue_family families[ANV_MAX_QUEUE_FAMILIES];
@@ -970,17 +1134,104 @@ struct anv_physical_device {
struct anv_memory_type types[VK_MAX_MEMORY_TYPES];
uint32_t heap_count;
struct anv_memory_heap heaps[VK_MAX_MEMORY_HEAPS];
- bool need_clflush;
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ bool need_flush;
+#endif
+ /** Mask of memory types of normal allocations */
+ uint32_t default_buffer_mem_types;
+ /** Mask of memory types of descriptor buffers */
+ uint32_t desc_buffer_mem_types;
+ /** Mask of memory types of protected buffers/images */
+ uint32_t protected_mem_types;
} memory;
- struct anv_memregion vram;
+ struct {
+ /**
+ * General state pool
+ */
+ struct anv_va_range general_state_pool;
+ /**
+ * Low 32bit heap
+ */
+ struct anv_va_range low_heap;
+ /**
+ * Binding table pool
+ */
+ struct anv_va_range binding_table_pool;
+ /**
+ * Internal surface states for blorp & push descriptors.
+ */
+ struct anv_va_range internal_surface_state_pool;
+ /**
+ * Scratch surfaces (overlaps with internal_surface_state_pool).
+ */
+ struct anv_va_range scratch_surface_state_pool;
+ /**
+ * Bindless surface states (indirectly referred to by indirect
+ * descriptors or for direct descriptors)
+ */
+ struct anv_va_range bindless_surface_state_pool;
+ /**
+ * Dynamic state pool
+ */
+ struct anv_va_range dynamic_state_pool;
+ /**
+ * Sampler state pool
+ */
+ struct anv_va_range sampler_state_pool;
+ /**
+ * Indirect descriptor pool
+ */
+ struct anv_va_range indirect_descriptor_pool;
+ /**
+ * Indirect push descriptor pool
+ */
+ struct anv_va_range indirect_push_descriptor_pool;
+ /**
+ * Instruction state pool
+ */
+ struct anv_va_range instruction_state_pool;
+ /**
+ * Dynamic state pool when using descriptor buffers
+ */
+ struct anv_va_range dynamic_state_db_pool;
+ /**
+ * Descriptor buffers
+ */
+ struct anv_va_range descriptor_buffer_pool;
+ /**
+ * Push descriptor with descriptor buffers
+ */
+ struct anv_va_range push_descriptor_buffer_pool;
+ /**
+ * AUX-TT
+ */
+ struct anv_va_range aux_tt_pool;
+ /**
+ * Client heap
+ */
+ struct anv_va_range high_heap;
+ struct anv_va_range trtt;
+ } va;
+
+ /* Either we have a single vram region and it's all mappable, or we have
+ * both mappable & non-mappable parts. System memory is always available.
+ */
+ struct anv_memregion vram_mappable;
+ struct anv_memregion vram_non_mappable;
struct anv_memregion sys;
uint8_t driver_build_sha1[20];
uint8_t pipeline_cache_uuid[VK_UUID_SIZE];
uint8_t driver_uuid[VK_UUID_SIZE];
uint8_t device_uuid[VK_UUID_SIZE];
+ uint8_t rt_uuid[VK_UUID_SIZE];
- struct disk_cache * disk_cache;
+ /* Maximum amount of scratch space used by all the GRL kernels */
+ uint32_t max_grl_scratch_size;
+
+ struct vk_sync_type sync_syncobj_type;
+ struct vk_sync_timeline_type sync_timeline_type;
+ const struct vk_sync_type * sync_types[4];
struct wsi_device wsi_device;
int local_fd;
@@ -991,229 +1242,609 @@ struct anv_physical_device {
bool has_master;
int64_t master_major;
int64_t master_minor;
- struct drm_i915_query_engine_info * engine_info;
+ struct intel_query_engine_info * engine_info;
- void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_bo *, uint32_t );
+ void (*cmd_emit_timestamp)(struct anv_batch *, struct anv_device *, struct anv_address,
+ enum anv_timestamp_capture_type, void *);
struct intel_measure_device measure_device;
-};
-struct anv_app_info {
- const char* app_name;
- uint32_t app_version;
- const char* engine_name;
- uint32_t engine_version;
- uint32_t api_version;
+ /* Value of PIPELINE_SELECT::PipelineSelection == GPGPU */
+ uint32_t gpgpu_pipeline_value;
};
-struct anv_instance {
- struct vk_instance vk;
+static inline uint32_t
+anv_physical_device_bindless_heap_size(const struct anv_physical_device *device,
+ bool descriptor_buffer)
+{
+ /* Pre-Gfx12.5, the HW bindless surface heap is only 64MB. On Gfx12.5+ it's 4GB,
+ * but we have some workarounds that require 2 heaps to overlap, so the
+ * size is dictated by our VA allocation.
+ */
+ return device->uses_ex_bso ?
+ (descriptor_buffer ?
+ device->va.descriptor_buffer_pool.size :
+ device->va.bindless_surface_state_pool.size) :
+ 64 * 1024 * 1024 /* 64 MiB */;
+}
- bool physical_devices_enumerated;
- struct list_head physical_devices;
+static inline bool
+anv_physical_device_has_vram(const struct anv_physical_device *device)
+{
+ return device->vram_mappable.size > 0;
+}
- bool pipeline_cache_enabled;
+struct anv_instance {
+ struct vk_instance vk;
struct driOptionCache dri_options;
struct driOptionCache available_dri_options;
+
+ int mesh_conv_prim_attrs_to_vert_attrs;
+ bool enable_tbimr;
+ bool external_memory_implicit_sync;
+
+ /**
+ * Workarounds for game bugs.
+ */
+ uint8_t assume_full_subgroups;
+ bool limit_trig_input_range;
+ bool sample_mask_out_opengl_behaviour;
+ bool force_filter_addr_rounding;
+ bool fp64_workaround_enabled;
+ float lower_depth_range_rate;
+ unsigned generated_indirect_threshold;
+ unsigned generated_indirect_ring_threshold;
+ unsigned query_clear_with_blorp_threshold;
+ unsigned query_copy_with_shader_threshold;
+ unsigned force_vk_vendor;
+ bool has_fake_sparse;
+ bool disable_fcv;
+ bool compression_control_enabled;
+
+ /* HW workarounds */
+ bool no_16bit;
+ bool intel_enable_wa_14018912822;
};
VkResult anv_init_wsi(struct anv_physical_device *physical_device);
void anv_finish_wsi(struct anv_physical_device *physical_device);
-struct anv_queue_submit {
- struct anv_cmd_buffer ** cmd_buffers;
- uint32_t cmd_buffer_count;
- uint32_t cmd_buffer_array_length;
-
- uint32_t fence_count;
- uint32_t fence_array_length;
- struct drm_i915_gem_exec_fence * fences;
- uint64_t * fence_values;
-
- uint32_t temporary_semaphore_count;
- uint32_t temporary_semaphore_array_length;
- struct anv_semaphore_impl * temporary_semaphores;
-
- /* Allocated only with non shareable timelines. */
- union {
- struct anv_timeline ** wait_timelines;
- uint32_t * wait_timeline_syncobjs;
- };
- uint32_t wait_timeline_count;
- uint32_t wait_timeline_array_length;
- uint64_t * wait_timeline_values;
-
- struct anv_timeline ** signal_timelines;
- uint32_t signal_timeline_count;
- uint32_t signal_timeline_array_length;
- uint64_t * signal_timeline_values;
-
- int in_fence;
- bool need_out_fence;
- int out_fence;
-
- uint32_t fence_bo_count;
- uint32_t fence_bo_array_length;
- /* An array of struct anv_bo pointers with lower bit used as a flag to
- * signal we will wait on that BO (see anv_(un)pack_ptr).
- */
- uintptr_t * fence_bos;
-
- int perf_query_pass;
- struct anv_query_pool * perf_query_pool;
-
- const VkAllocationCallbacks * alloc;
- VkSystemAllocationScope alloc_scope;
-
- struct anv_bo * simple_bo;
- uint32_t simple_bo_size;
-
- struct list_head link;
-};
-
struct anv_queue {
- struct vk_object_base base;
+ struct vk_queue vk;
struct anv_device * device;
- VkDeviceQueueCreateFlags flags;
const struct anv_queue_family * family;
- uint32_t exec_flags;
+ struct intel_batch_decode_ctx * decoder;
- /* Set once from the device api calls. */
- bool lost_signaled;
+ union {
+ uint32_t exec_flags; /* i915 */
+ uint32_t context_id; /* i915 */
+ uint32_t exec_queue_id; /* Xe */
+ };
- /* Only set once atomically by the queue */
- int lost;
- int error_line;
- const char * error_file;
- char error_msg[80];
+ /** Context/Engine id which executes the companion RCS command buffer */
+ uint32_t companion_rcs_id;
- /*
- * This mutext protects the variables below.
- */
- pthread_mutex_t mutex;
+ /** Synchronization object for debug purposes (DEBUG_SYNC) */
+ struct vk_sync *sync;
- pthread_t thread;
- pthread_cond_t cond;
-
- /*
- * A list of struct anv_queue_submit to be submitted to i915.
+ /** Companion synchronization object
+ *
+ * Vulkan command buffers can be destroyed as soon as their lifecycle moves
+ * from the Pending state to the Invalid/Executable state. This transition
+ * happens when the VkFence/VkSemaphore associated with the completion of
+ * the command buffer work is signaled.
+ *
+ * When we're using a companion command buffer to execute part of another
+ * command buffer, we need to tie the 2 work submissions together to ensure
+ * that, when the associated VkFence/VkSemaphore is signaled, both command
+ * buffers are actually unused by the HW. To do this, we run an empty batch
+ * buffer that we use to signal after both submissions:
+ *
+ * CCS --> main ---> empty_batch (with wait on companion) --> signal
+ * RCS --> companion -|
+ *
+ * When the companion batch completes, it signals companion_sync and allows
+ * empty_batch to execute. Since empty_batch is running on the main engine,
+ * we're guaranteed that upon completion both main & companion command
+ * buffers are not used by the HW anymore.
*/
- struct list_head queued_submits;
+ struct vk_sync *companion_sync;
- /* Set to true to stop the submission thread */
- bool quit;
-};
-
-struct anv_pipeline_cache {
- struct vk_object_base base;
- struct anv_device * device;
- pthread_mutex_t mutex;
-
- struct hash_table * nir_cache;
-
- struct hash_table * cache;
-
- bool external_sync;
+ struct intel_ds_queue ds;
};
struct nir_xfb_info;
struct anv_pipeline_bind_map;
+struct anv_pipeline_sets_layout;
+struct anv_push_descriptor_info;
+enum anv_dynamic_push_bits;
-void anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
- struct anv_device *device,
- bool cache_enabled,
- bool external_sync);
-void anv_pipeline_cache_finish(struct anv_pipeline_cache *cache);
+void anv_device_init_embedded_samplers(struct anv_device *device);
+void anv_device_finish_embedded_samplers(struct anv_device *device);
-struct anv_shader_bin *
-anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
- const void *key, uint32_t key_size);
-struct anv_shader_bin *
-anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const struct nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map);
+extern const struct vk_pipeline_cache_object_ops *const anv_cache_import_ops[2];
struct anv_shader_bin *
anv_device_search_for_kernel(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const void *key_data, uint32_t key_size,
bool *user_cache_bit);
+struct anv_shader_upload_params;
+
struct anv_shader_bin *
anv_device_upload_kernel(struct anv_device *device,
- struct anv_pipeline_cache *cache,
- gl_shader_stage stage,
- const void *key_data, uint32_t key_size,
- const void *kernel_data, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats,
- uint32_t num_stats,
- const struct nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map);
+ struct vk_pipeline_cache *cache,
+ const struct anv_shader_upload_params *params);
struct nir_shader;
struct nir_shader_compiler_options;
struct nir_shader *
anv_device_search_for_nir(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const struct nir_shader_compiler_options *nir_options,
unsigned char sha1_key[20],
void *mem_ctx);
void
anv_device_upload_nir(struct anv_device *device,
- struct anv_pipeline_cache *cache,
+ struct vk_pipeline_cache *cache,
const struct nir_shader *nir,
unsigned char sha1_key[20]);
-struct anv_address {
+void
+anv_load_fp64_shader(struct anv_device *device);
+
+/**
+ * This enum tracks the various HW instructions that hold graphics state
+ * needing to be reprogrammed. Some instructions are grouped together as they
+ * pretty much need to be emitted together (like 3DSTATE_URB_*).
+ *
+ * Not all bits apply to all platforms. We build a dirty state based on
+ * the enabled extensions & device generation in anv_device.
+ */
+enum anv_gfx_state_bits {
+ /* Pipeline states */
+ ANV_GFX_STATE_URB, /* All legacy stages, including mesh */
+ ANV_GFX_STATE_VF_STATISTICS,
+ ANV_GFX_STATE_VF_SGVS,
+ ANV_GFX_STATE_VF_SGVS_2,
+ ANV_GFX_STATE_VF_SGVS_VI, /* 3DSTATE_VERTEX_ELEMENTS for sgvs elements */
+ ANV_GFX_STATE_VF_SGVS_INSTANCING, /* 3DSTATE_VF_INSTANCING for sgvs elements */
+ ANV_GFX_STATE_PRIMITIVE_REPLICATION,
+ ANV_GFX_STATE_SBE,
+ ANV_GFX_STATE_SBE_SWIZ,
+ ANV_GFX_STATE_SO_DECL_LIST,
+ ANV_GFX_STATE_VS,
+ ANV_GFX_STATE_HS,
+ ANV_GFX_STATE_DS,
+ ANV_GFX_STATE_GS,
+ ANV_GFX_STATE_PS,
+ ANV_GFX_STATE_SBE_MESH,
+ ANV_GFX_STATE_CLIP_MESH,
+ ANV_GFX_STATE_MESH_CONTROL,
+ ANV_GFX_STATE_MESH_SHADER,
+ ANV_GFX_STATE_MESH_DISTRIB,
+ ANV_GFX_STATE_TASK_CONTROL,
+ ANV_GFX_STATE_TASK_SHADER,
+ ANV_GFX_STATE_TASK_REDISTRIB,
+ /* Dynamic states */
+ ANV_GFX_STATE_BLEND_STATE, /* Just the dynamic state structure */
+ ANV_GFX_STATE_BLEND_STATE_PTR, /* The pointer to the dynamic state */
+ ANV_GFX_STATE_CLIP,
+ ANV_GFX_STATE_CC_STATE,
+ ANV_GFX_STATE_CC_STATE_PTR,
+ ANV_GFX_STATE_CPS,
+ ANV_GFX_STATE_DEPTH_BOUNDS,
+ ANV_GFX_STATE_INDEX_BUFFER,
+ ANV_GFX_STATE_LINE_STIPPLE,
+ ANV_GFX_STATE_MULTISAMPLE,
+ ANV_GFX_STATE_PS_BLEND,
+ ANV_GFX_STATE_RASTER,
+ ANV_GFX_STATE_SAMPLE_MASK,
+ ANV_GFX_STATE_SAMPLE_PATTERN,
+ ANV_GFX_STATE_SCISSOR,
+ ANV_GFX_STATE_SF,
+ ANV_GFX_STATE_STREAMOUT,
+ ANV_GFX_STATE_TE,
+ ANV_GFX_STATE_VERTEX_INPUT,
+ ANV_GFX_STATE_VF,
+ ANV_GFX_STATE_VF_TOPOLOGY,
+ ANV_GFX_STATE_VFG,
+ ANV_GFX_STATE_VIEWPORT_CC,
+ ANV_GFX_STATE_VIEWPORT_CC_PTR,
+ ANV_GFX_STATE_VIEWPORT_SF_CLIP,
+ ANV_GFX_STATE_WM,
+ ANV_GFX_STATE_WM_DEPTH_STENCIL,
+ ANV_GFX_STATE_PS_EXTRA,
+ ANV_GFX_STATE_PMA_FIX, /* Fake state to implement workaround */
+ ANV_GFX_STATE_WA_18019816803, /* Fake state to implement workaround */
+ ANV_GFX_STATE_TBIMR_TILE_PASS_INFO,
+
+ ANV_GFX_STATE_MAX,
+};
+
+const char *anv_gfx_state_bit_to_str(enum anv_gfx_state_bits state);
+
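To make the dirty-state tracking described above concrete, here is a small sketch (not part of this patch) using the util/bitset.h macros; the function and the chosen bits are only illustrative.

static void
gfx_dirty_bits_example(void)
{
   BITSET_DECLARE(dirty, ANV_GFX_STATE_MAX) = { 0 };

   /* A dynamic-state change marks the matching instruction for re-emission. */
   BITSET_SET(dirty, ANV_GFX_STATE_SCISSOR);
   BITSET_SET(dirty, ANV_GFX_STATE_VIEWPORT_CC);

   /* At draw time only the instructions whose bit is set are re-emitted. */
   unsigned i;
   BITSET_FOREACH_SET(i, dirty, ANV_GFX_STATE_MAX) {
      const char *name = anv_gfx_state_bit_to_str((enum anv_gfx_state_bits)i);
      (void)name; /* ...emit the corresponding 3DSTATE_* packet... */
   }
}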
+/* This structure tracks the values to program in the HW instructions
+ * corresponding to dynamic states of the Vulkan API. Only fields that need to
+ * be reemitted outside of the VkPipeline object are tracked here.
+ */
+struct anv_gfx_dynamic_state {
+ /* 3DSTATE_BLEND_STATE_POINTERS */
+ struct {
+ bool AlphaToCoverageEnable;
+ bool AlphaToOneEnable;
+ bool IndependentAlphaBlendEnable;
+ struct {
+ bool WriteDisableAlpha;
+ bool WriteDisableRed;
+ bool WriteDisableGreen;
+ bool WriteDisableBlue;
+
+ uint32_t LogicOpFunction;
+ bool LogicOpEnable;
+
+ bool ColorBufferBlendEnable;
+ uint32_t ColorClampRange;
+ bool PreBlendColorClampEnable;
+ bool PostBlendColorClampEnable;
+ uint32_t SourceBlendFactor;
+ uint32_t DestinationBlendFactor;
+ uint32_t ColorBlendFunction;
+ uint32_t SourceAlphaBlendFactor;
+ uint32_t DestinationAlphaBlendFactor;
+ uint32_t AlphaBlendFunction;
+ } rts[MAX_RTS];
+
+ struct anv_state state;
+ } blend;
+
+ /* 3DSTATE_CC_STATE_POINTERS */
+ struct {
+ float BlendConstantColorRed;
+ float BlendConstantColorGreen;
+ float BlendConstantColorBlue;
+ float BlendConstantColorAlpha;
+
+ struct anv_state state;
+ } cc;
+
+ /* 3DSTATE_CLIP */
+ struct {
+ uint32_t APIMode;
+ uint32_t ViewportXYClipTestEnable;
+ uint32_t MaximumVPIndex;
+ uint32_t TriangleStripListProvokingVertexSelect;
+ uint32_t LineStripListProvokingVertexSelect;
+ uint32_t TriangleFanProvokingVertexSelect;
+ } clip;
+
+ /* 3DSTATE_CPS/3DSTATE_CPS_POINTERS */
+ struct {
+ /* Gfx11 */
+ uint32_t CoarsePixelShadingMode;
+ float MinCPSizeX;
+ float MinCPSizeY;
+ /* Gfx12+ */
+ uint32_t CoarsePixelShadingStateArrayPointer;
+ } cps;
+
+ /* 3DSTATE_DEPTH_BOUNDS */
+ struct {
+ bool DepthBoundsTestEnable;
+ float DepthBoundsTestMinValue;
+ float DepthBoundsTestMaxValue;
+ } db;
+
+ /* 3DSTATE_GS */
+ struct {
+ uint32_t ReorderMode;
+ } gs;
+
+ /* 3DSTATE_LINE_STIPPLE */
+ struct {
+ uint32_t LineStipplePattern;
+ float LineStippleInverseRepeatCount;
+ uint32_t LineStippleRepeatCount;
+ } ls;
+
+ /* 3DSTATE_MULTISAMPLE */
+ struct {
+ uint32_t NumberofMultisamples;
+ } ms;
+
+ /* 3DSTATE_PS */
+ struct {
+ uint32_t PositionXYOffsetSelect;
+
+ uint32_t KernelStartPointer0;
+ uint32_t KernelStartPointer1;
+ uint32_t KernelStartPointer2;
+
+ uint32_t DispatchGRFStartRegisterForConstantSetupData0;
+ uint32_t DispatchGRFStartRegisterForConstantSetupData1;
+ uint32_t DispatchGRFStartRegisterForConstantSetupData2;
+
+ /* Pre-Gfx20 only */
+ bool _8PixelDispatchEnable;
+ bool _16PixelDispatchEnable;
+ bool _32PixelDispatchEnable;
+
+ /* Gfx20+ only */
+ bool Kernel0Enable;
+ bool Kernel1Enable;
+ uint32_t Kernel0SIMDWidth;
+ uint32_t Kernel1SIMDWidth;
+ uint32_t Kernel0PolyPackingPolicy;
+ } ps;
+
+ /* 3DSTATE_PS_EXTRA */
+ struct {
+ bool PixelShaderIsPerSample;
+ bool PixelShaderKillsPixel;
+ bool PixelShaderIsPerCoarsePixel;
+ bool EnablePSDependencyOnCPsizeChange;
+ } ps_extra;
+
+ /* 3DSTATE_PS_BLEND */
+ struct {
+ bool HasWriteableRT;
+ bool ColorBufferBlendEnable;
+ uint32_t SourceAlphaBlendFactor;
+ uint32_t DestinationAlphaBlendFactor;
+ uint32_t SourceBlendFactor;
+ uint32_t DestinationBlendFactor;
+ bool AlphaTestEnable;
+ bool IndependentAlphaBlendEnable;
+ bool AlphaToCoverageEnable;
+ } ps_blend;
+
+ /* 3DSTATE_RASTER */
+ struct {
+ uint32_t APIMode;
+ bool DXMultisampleRasterizationEnable;
+ bool AntialiasingEnable;
+ uint32_t CullMode;
+ uint32_t FrontWinding;
+ bool GlobalDepthOffsetEnableSolid;
+ bool GlobalDepthOffsetEnableWireframe;
+ bool GlobalDepthOffsetEnablePoint;
+ float GlobalDepthOffsetConstant;
+ float GlobalDepthOffsetScale;
+ float GlobalDepthOffsetClamp;
+ uint32_t FrontFaceFillMode;
+ uint32_t BackFaceFillMode;
+ bool ViewportZFarClipTestEnable;
+ bool ViewportZNearClipTestEnable;
+ bool ConservativeRasterizationEnable;
+ } raster;
+
+ /* 3DSTATE_SCISSOR_STATE_POINTERS */
+ struct {
+ uint32_t count;
+ struct {
+ uint32_t ScissorRectangleYMin;
+ uint32_t ScissorRectangleXMin;
+ uint32_t ScissorRectangleYMax;
+ uint32_t ScissorRectangleXMax;
+ } elem[MAX_SCISSORS];
+ } scissor;
+
+ /* 3DSTATE_SF */
+ struct {
+ float LineWidth;
+ uint32_t TriangleStripListProvokingVertexSelect;
+ uint32_t LineStripListProvokingVertexSelect;
+ uint32_t TriangleFanProvokingVertexSelect;
+ bool LegacyGlobalDepthBiasEnable;
+ } sf;
+
+ /* 3DSTATE_STREAMOUT */
+ struct {
+ bool RenderingDisable;
+ uint32_t RenderStreamSelect;
+ uint32_t ReorderMode;
+ uint32_t ForceRendering;
+ } so;
+
+ /* 3DSTATE_SAMPLE_MASK */
+ struct {
+ uint32_t SampleMask;
+ } sm;
+
+ /* 3DSTATE_TE */
+ struct {
+ uint32_t OutputTopology;
+ } te;
+
+ /* 3DSTATE_VF */
+ struct {
+ bool IndexedDrawCutIndexEnable;
+ uint32_t CutIndex;
+ } vf;
+
+ /* 3DSTATE_VFG */
+ struct {
+ uint32_t DistributionMode;
+ bool ListCutIndexEnable;
+ } vfg;
+
+ /* 3DSTATE_VF_TOPOLOGY */
+ struct {
+ uint32_t PrimitiveTopologyType;
+ } vft;
+
+ /* 3DSTATE_VIEWPORT_STATE_POINTERS_CC */
+ struct {
+ uint32_t count;
+ struct {
+ float MinimumDepth;
+ float MaximumDepth;
+ } elem[MAX_VIEWPORTS];
+
+ struct anv_state state;
+ } vp_cc;
+
+ /* 3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP */
+ struct {
+ uint32_t count;
+ struct {
+ float ViewportMatrixElementm00;
+ float ViewportMatrixElementm11;
+ float ViewportMatrixElementm22;
+ float ViewportMatrixElementm30;
+ float ViewportMatrixElementm31;
+ float ViewportMatrixElementm32;
+ float XMinClipGuardband;
+ float XMaxClipGuardband;
+ float YMinClipGuardband;
+ float YMaxClipGuardband;
+ float XMinViewPort;
+ float XMaxViewPort;
+ float YMinViewPort;
+ float YMaxViewPort;
+ } elem[MAX_VIEWPORTS];
+ } vp_sf_clip;
+
+ /* 3DSTATE_WM */
+ struct {
+ uint32_t ForceThreadDispatchEnable;
+ bool LineStippleEnable;
+ uint32_t BarycentricInterpolationMode;
+ } wm;
+
+ /* 3DSTATE_WM_DEPTH_STENCIL */
+ struct {
+ bool DoubleSidedStencilEnable;
+ uint32_t StencilTestMask;
+ uint32_t StencilWriteMask;
+ uint32_t BackfaceStencilTestMask;
+ uint32_t BackfaceStencilWriteMask;
+ uint32_t StencilReferenceValue;
+ uint32_t BackfaceStencilReferenceValue;
+ bool DepthTestEnable;
+ bool DepthBufferWriteEnable;
+ uint32_t DepthTestFunction;
+ bool StencilTestEnable;
+ bool StencilBufferWriteEnable;
+ uint32_t StencilFailOp;
+ uint32_t StencilPassDepthPassOp;
+ uint32_t StencilPassDepthFailOp;
+ uint32_t StencilTestFunction;
+ uint32_t BackfaceStencilFailOp;
+ uint32_t BackfaceStencilPassDepthPassOp;
+ uint32_t BackfaceStencilPassDepthFailOp;
+ uint32_t BackfaceStencilTestFunction;
+ } ds;
+
+ /* 3DSTATE_TBIMR_TILE_PASS_INFO */
+ struct {
+ unsigned TileRectangleHeight;
+ unsigned TileRectangleWidth;
+ unsigned VerticalTileCount;
+ unsigned HorizontalTileCount;
+ unsigned TBIMRBatchSize;
+ unsigned TileBoxCheck;
+ } tbimr;
+ bool use_tbimr;
+
+ bool pma_fix;
+
+ BITSET_DECLARE(dirty, ANV_GFX_STATE_MAX);
+};
+
+enum anv_internal_kernel_name {
+ ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE,
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT,
+ ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
+
+ ANV_INTERNAL_KERNEL_COUNT,
+};
+
+enum anv_rt_bvh_build_method {
+ ANV_BVH_BUILD_METHOD_TRIVIAL,
+ ANV_BVH_BUILD_METHOD_NEW_SAH,
+};
+
+struct anv_device_astc_emu {
+ struct vk_texcompress_astc_state *texcompress;
+
+ /* for flush_astc_ldr_void_extent_denorms */
+ simple_mtx_t mutex;
+ VkDescriptorSetLayout ds_layout;
+ VkPipelineLayout pipeline_layout;
+ VkPipeline pipeline;
+};
+
+struct anv_trtt_batch_bo {
struct anv_bo *bo;
- int64_t offset;
+ uint32_t size;
+
+ /* Once device->trtt.timeline_handle signals timeline_val as complete we
+ * can free this struct and its members.
+ */
+ uint64_t timeline_val;
+
+ /* Part of device->trtt.in_flight_batches. */
+ struct list_head link;
};
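A hedged sketch (not part of this patch) of the lifetime rule above: once the TR-TT timeline has signaled a given value, every in-flight batch at or below that value can be reclaimed with anv_trtt_batch_bo_free() (added further down in this header). The function name is invented, locking of device->trtt.mutex is elided, and signaled_value is assumed to come from querying the timeline syncobj.

static void
trtt_reclaim_example(struct anv_device *device, uint64_t signaled_value)
{
   list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
                            &device->trtt.in_flight_batches, link) {
      if (trtt_bbo->timeline_val <= signaled_value)
         anv_trtt_batch_bo_free(device, trtt_bbo);
   }
}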
struct anv_device {
struct vk_device vk;
struct anv_physical_device * physical;
- struct intel_device_info info;
+ const struct intel_device_info * info;
+ const struct anv_kmd_backend * kmd_backend;
struct isl_device isl_dev;
- int context_id;
+ union {
+ uint32_t context_id; /* i915 */
+ uint32_t vm_id; /* Xe */
+ };
int fd;
- bool can_chain_batches;
- bool robust_buffer_access;
- bool has_thread_submit;
pthread_mutex_t vma_mutex;
struct util_vma_heap vma_lo;
- struct util_vma_heap vma_cva;
struct util_vma_heap vma_hi;
+ struct util_vma_heap vma_desc;
+ struct util_vma_heap vma_desc_buf;
+ struct util_vma_heap vma_samplers;
+ struct util_vma_heap vma_trtt;
/** List of all anv_device_memory objects */
struct list_head memory_objects;
+ /** List of anv_image objects with a private binding for implicit CCS */
+ struct list_head image_private_objects;
+
+ /** Memory pool for batch buffers */
struct anv_bo_pool batch_bo_pool;
+ /** Memory pool for utrace timestamp buffers */
+ struct anv_bo_pool utrace_bo_pool;
+ /** Memory pool for BVH build buffers */
+ struct anv_bo_pool bvh_bo_pool;
struct anv_bo_cache bo_cache;
struct anv_state_pool general_state_pool;
+ struct anv_state_pool aux_tt_pool;
struct anv_state_pool dynamic_state_pool;
+ struct anv_state_pool dynamic_state_db_pool;
struct anv_state_pool instruction_state_pool;
struct anv_state_pool binding_table_pool;
- struct anv_state_pool surface_state_pool;
+ struct anv_state_pool scratch_surface_state_pool;
+ struct anv_state_pool internal_surface_state_pool;
+ struct anv_state_pool bindless_surface_state_pool;
+ struct anv_state_pool indirect_push_descriptor_pool;
+ struct anv_state_pool push_descriptor_buffer_pool;
struct anv_state_reserved_pool custom_border_colors;
+ struct anv_state_reserved_array_pool custom_border_colors_db;
/** BO used for various workarounds
*
@@ -1227,31 +1858,100 @@ struct anv_device {
struct anv_bo * workaround_bo;
struct anv_address workaround_address;
+ /**
+ * Workarounds for game bugs.
+ */
+ struct {
+ struct set * doom64_images;
+ } workarounds;
+
struct anv_bo * trivial_batch_bo;
struct anv_state null_surface_state;
- struct anv_pipeline_cache default_pipeline_cache;
- struct blorp_context blorp;
+ /**
+ * NULL surface state copy stored in host memory for use as a fast
+ * memcpy() source.
+ */
+ char host_null_surface_state[ANV_SURFACE_STATE_SIZE];
+
+ struct vk_pipeline_cache * default_pipeline_cache;
+ struct vk_pipeline_cache * internal_cache;
+
+ struct {
+ struct blorp_context context;
+ struct {
+ struct anv_state state;
+ struct anv_state db_state;
+ } dynamic_states[BLORP_DYNAMIC_STATE_COUNT];
+ } blorp;
struct anv_state border_colors;
+ struct anv_state border_colors_db;
struct anv_state slice_hash;
+ struct anv_state slice_hash_db;
+
+ /** An array of CPS_STATE structures grouped by MAX_VIEWPORTS elements
+ *
+ * We need to emit CPS_STATE structures for each viewport accessible by a
+ * pipeline. So rather than write many identical CPS_STATE structures
+ * dynamically, we can enumerate all possible combinaisons and then just
+ * emit a 3DSTATE_CPS_POINTERS instruction with the right offset into this
+ * array.
+ */
+ struct anv_state cps_states;
+ struct anv_state cps_states_db;
uint32_t queue_count;
struct anv_queue * queues;
struct anv_scratch_pool scratch_pool;
struct anv_bo *rt_scratch_bos[16];
+ struct anv_bo *btd_fifo_bo;
+ struct anv_address rt_uuid_addr;
+
+ /** A pre-packed VERTEX_ELEMENT_STATE feeding 0s to the VS stage
+ *
+ * For use when a pipeline has no VS input
+ */
+ uint32_t empty_vs_input[2];
+
+ bool robust_buffer_access;
+
+ uint32_t protected_session_id;
+
+ /** Shadow ray query BO
+ *
+ * The ray_query_bo only holds the current ray being traced. When using
+ * more than 1 ray query per thread, we cannot fit all the queries in
+ * there, so we need another buffer to hold query data that is not
+ * currently being used by the HW for tracing, similar to a scratch space.
+ *
+ * The size of the shadow buffer depends on the number of queries per
+ * shader.
+ */
+ struct anv_bo *ray_query_shadow_bos[16];
+ /** Ray query buffer used to communicate with the HW unit.
+ */
+ struct anv_bo *ray_query_bo;
struct anv_shader_bin *rt_trampoline;
struct anv_shader_bin *rt_trivial_return;
+ enum anv_rt_bvh_build_method bvh_build_method;
+
+ /** Draw generation shader
+ *
+ * Generates direct draw calls out of indirect parameters. Used to
+ * work around slowness with indirect draw calls.
+ */
+ struct anv_shader_bin *internal_kernels[ANV_INTERNAL_KERNEL_COUNT];
+ const struct intel_l3_config *internal_kernels_l3_config;
+
pthread_mutex_t mutex;
pthread_cond_t queue_submit;
- int _lost;
- int lost_reported;
- struct intel_batch_decode_ctx decoder_ctx;
+ struct intel_batch_decode_ctx decoder[ANV_MAX_QUEUE_FAMILIES];
/*
* When decoding a anv_cmd_buffer, we might need to search for BOs through
* the cmd_buffer's list.
@@ -1266,62 +1966,138 @@ struct anv_device {
const struct intel_l3_config *l3_config;
struct intel_debug_block_frame *debug_frame_desc;
-};
-#if defined(GFX_VERx10) && GFX_VERx10 >= 90
-#define ANV_ALWAYS_SOFTPIN true
-#else
-#define ANV_ALWAYS_SOFTPIN false
+ struct intel_ds_device ds;
+
+ nir_shader *fp64_nir;
+
+ uint32_t draw_call_count;
+ struct anv_state breakpoint;
+#if DETECT_OS_ANDROID
+ struct u_gralloc *u_gralloc;
#endif
-static inline bool
-anv_use_softpin(const struct anv_physical_device *pdevice)
+ /** Precompute all dirty graphics bits
+ *
+ * Depending on platforms, some of the dirty bits don't apply (for example
+ * 3DSTATE_PRIMITIVE_REPLICATION is only Gfx12.0+). Disabling some
+ * extensions like Mesh shaders also allow us to avoid emitting any
+ * mesh/task related instructions (we only initialize them once at device
+ * initialization).
+ */
+ BITSET_DECLARE(gfx_dirty_state, ANV_GFX_STATE_MAX);
+
+ /*
+ * Command pool for companion RCS command buffer.
+ */
+ VkCommandPool companion_rcs_cmd_pool;
+
+ struct anv_trtt {
+ pthread_mutex_t mutex;
+
+ /* Sometimes we need to run batches from places where we don't have a
+ * queue coming from the API, so we use this.
+ */
+ struct anv_queue *queue;
+
+ /* There's only one L3 table, so if l3_addr is zero that means we
+ * didn't initialize the TR-TT context yet (i.e., we're not using TR-TT
+ * yet in this context).
+ */
+ uint64_t l3_addr;
+
+ /* We don't want to access the page tables from the CPU, so just
+ * maintain a mirror that we can use.
+ */
+ uint64_t *l3_mirror;
+ uint64_t *l2_mirror;
+
+ /* We keep a dynamic list of page table bos, and each bo can store
+ * multiple page tables.
+ */
+ struct anv_bo **page_table_bos;
+ int num_page_table_bos;
+ int page_table_bos_capacity;
+
+ /* These are used to keep track of space available for more page tables
+ * within a bo.
+ */
+ struct anv_bo *cur_page_table_bo;
+ uint64_t next_page_table_bo_offset;
+
+ /* Timeline syncobj used to track completion of the TR-TT batch BOs. */
+ uint32_t timeline_handle;
+ uint64_t timeline_val;
+
+ /* List of struct anv_trtt_batch_bo batches that are in flight and can
+ * be freed once their timeline gets signaled.
+ */
+ struct list_head in_flight_batches;
+ } trtt;
+
+ /* Number of sparse resources that currently exist. This is used for a
+ * workaround that makes every memoryBarrier flush more things than it
+ * should. Some workloads create and then immediately destroy sparse
+ * resources when they start, so just counting if a sparse resource was
+ * ever created is not enough.
+ */
+ uint32_t num_sparse_resources;
+
+ struct anv_device_astc_emu astc_emu;
+
+ struct intel_bind_timeline bind_timeline; /* Xe only */
+
+ struct {
+ simple_mtx_t mutex;
+ struct hash_table *map;
+ } embedded_samplers;
+};
+
+static inline uint32_t
+anv_get_first_render_queue_index(struct anv_physical_device *pdevice)
{
-#if defined(GFX_VERx10) && GFX_VERx10 >= 90
- /* Sky Lake and later always uses softpin */
- assert(pdevice->use_softpin);
- return true;
-#elif defined(GFX_VERx10) && GFX_VERx10 < 80
- /* Haswell and earlier never use softpin */
- assert(!pdevice->use_softpin);
- return false;
-#else
- /* If we don't have a GFX_VERx10 #define, we need to look at the physical
- * device. Also, for GFX version 8, we need to look at the physical
- * device because Broadwell softpins but Cherryview doesn't.
- */
- return pdevice->use_softpin;
-#endif
+ assert(pdevice != NULL);
+
+ for (uint32_t i = 0; i < pdevice->queue.family_count; i++) {
+ if (pdevice->queue.families[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) {
+ return i;
+ }
+ }
+
+ unreachable("Graphics capable queue family not found");
}
-static inline struct anv_instance *
-anv_device_instance_or_null(const struct anv_device *device)
+static inline struct anv_state
+anv_binding_table_pool_alloc(struct anv_device *device)
{
- return device ? device->physical->instance : NULL;
+ return anv_state_pool_alloc(&device->binding_table_pool,
+ device->binding_table_pool.block_size, 0);
}
-static inline struct anv_state_pool *
-anv_binding_table_pool(struct anv_device *device)
+static inline void
+anv_binding_table_pool_free(struct anv_device *device, struct anv_state state)
{
- if (anv_use_softpin(device->physical))
- return &device->binding_table_pool;
- else
- return &device->surface_state_pool;
+ anv_state_pool_free(&device->binding_table_pool, state);
}
static inline struct anv_state
-anv_binding_table_pool_alloc(struct anv_device *device)
+anv_null_surface_state_for_binding_table(struct anv_device *device)
{
- if (anv_use_softpin(device->physical))
- return anv_state_pool_alloc(&device->binding_table_pool,
- device->binding_table_pool.block_size, 0);
- else
- return anv_state_pool_alloc_back(&device->surface_state_pool);
+ struct anv_state state = device->null_surface_state;
+ if (device->physical->indirect_descriptors) {
+ state.offset += device->physical->va.bindless_surface_state_pool.addr -
+ device->physical->va.internal_surface_state_pool.addr;
+ }
+ return state;
}
-static inline void
-anv_binding_table_pool_free(struct anv_device *device, struct anv_state state) {
- anv_state_pool_free(anv_binding_table_pool(device), state);
+static inline struct anv_state
+anv_bindless_state_for_binding_table(struct anv_device *device,
+ struct anv_state state)
+{
+ state.offset += device->physical->va.bindless_surface_state_pool.addr -
+ device->physical->va.internal_surface_state_pool.addr;
+ return state;
}
static inline uint32_t
@@ -1329,92 +2105,34 @@ anv_mocs(const struct anv_device *device,
const struct anv_bo *bo,
isl_surf_usage_flags_t usage)
{
- return isl_mocs(&device->isl_dev, usage, bo && bo->is_external);
+ return isl_mocs(&device->isl_dev, usage, bo && anv_bo_is_external(bo));
}
-void anv_device_init_blorp(struct anv_device *device);
-void anv_device_finish_blorp(struct anv_device *device);
-
-void _anv_device_report_lost(struct anv_device *device);
-VkResult _anv_device_set_lost(struct anv_device *device,
- const char *file, int line,
- const char *msg, ...)
- anv_printflike(4, 5);
-VkResult _anv_queue_set_lost(struct anv_queue *queue,
- const char *file, int line,
- const char *msg, ...)
- anv_printflike(4, 5);
-#define anv_device_set_lost(dev, ...) \
- _anv_device_set_lost(dev, __FILE__, __LINE__, __VA_ARGS__)
-#define anv_queue_set_lost(queue, ...) \
- (queue)->device->has_thread_submit ? \
- _anv_queue_set_lost(queue, __FILE__, __LINE__, __VA_ARGS__) : \
- _anv_device_set_lost(queue->device, __FILE__, __LINE__, __VA_ARGS__)
-
-static inline bool
-anv_device_is_lost(struct anv_device *device)
+static inline uint32_t
+anv_mocs_for_address(const struct anv_device *device,
+ struct anv_address *addr)
{
- int lost = p_atomic_read(&device->_lost);
- if (unlikely(lost && !device->lost_reported))
- _anv_device_report_lost(device);
- return lost;
+ return anv_mocs(device, addr->bo, 0);
}
-VkResult anv_device_query_status(struct anv_device *device);
-
-
-enum anv_bo_alloc_flags {
- /** Specifies that the BO must have a 32-bit address
- *
- * This is the opposite of EXEC_OBJECT_SUPPORTS_48B_ADDRESS.
- */
- ANV_BO_ALLOC_32BIT_ADDRESS = (1 << 0),
-
- /** Specifies that the BO may be shared externally */
- ANV_BO_ALLOC_EXTERNAL = (1 << 1),
-
- /** Specifies that the BO should be mapped */
- ANV_BO_ALLOC_MAPPED = (1 << 2),
-
- /** Specifies that the BO should be snooped so we get coherency */
- ANV_BO_ALLOC_SNOOPED = (1 << 3),
-
- /** Specifies that the BO should be captured in error states */
- ANV_BO_ALLOC_CAPTURE = (1 << 4),
-
- /** Specifies that the BO will have an address assigned by the caller
- *
- * Such BOs do not exist in any VMA heap.
- */
- ANV_BO_ALLOC_FIXED_ADDRESS = (1 << 5),
-
- /** Enables implicit synchronization on the BO
- *
- * This is the opposite of EXEC_OBJECT_ASYNC.
- */
- ANV_BO_ALLOC_IMPLICIT_SYNC = (1 << 6),
-
- /** Enables implicit synchronization on the BO
- *
- * This is equivalent to EXEC_OBJECT_WRITE.
- */
- ANV_BO_ALLOC_IMPLICIT_WRITE = (1 << 7),
-
- /** Has an address which is visible to the client */
- ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8),
-
- /** This buffer has implicit CCS data attached to it */
- ANV_BO_ALLOC_IMPLICIT_CCS = (1 << 9),
-
- /** This buffer is allocated from local memory */
- ANV_BO_ALLOC_LOCAL_MEM = (1 << 10),
-};
+void anv_device_init_blorp(struct anv_device *device);
+void anv_device_finish_blorp(struct anv_device *device);
VkResult anv_device_alloc_bo(struct anv_device *device,
const char *name, uint64_t size,
enum anv_bo_alloc_flags alloc_flags,
uint64_t explicit_address,
struct anv_bo **bo);
+VkResult anv_device_map_bo(struct anv_device *device,
+ struct anv_bo *bo,
+ uint64_t offset,
+ size_t size,
+ void *placed_addr,
+ void **map_out);
+VkResult anv_device_unmap_bo(struct anv_device *device,
+ struct anv_bo *bo,
+ void *map, size_t map_size,
+ bool replace);
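For illustration only (not part of this patch), a CPU upload through the new map/unmap entry points could look like this; passing NULL as the placed address is assumed to request a regular mapping, and the wrapper is invented.

static VkResult
bo_write_example(struct anv_device *device, struct anv_bo *bo,
                 const void *data, size_t size)
{
   void *map;
   VkResult result = anv_device_map_bo(device, bo, 0 /* offset */, size,
                                       NULL /* placed_addr */, &map);
   if (result != VK_SUCCESS)
      return result;

   memcpy(map, data, size);

   return anv_device_unmap_bo(device, bo, map, size, false /* replace */);
}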
VkResult anv_device_import_bo_from_host_ptr(struct anv_device *device,
void *host_ptr, uint32_t size,
enum anv_bo_alloc_flags alloc_flags,
@@ -1426,128 +2144,124 @@ VkResult anv_device_import_bo(struct anv_device *device, int fd,
struct anv_bo **bo);
VkResult anv_device_export_bo(struct anv_device *device,
struct anv_bo *bo, int *fd_out);
+VkResult anv_device_get_bo_tiling(struct anv_device *device,
+ struct anv_bo *bo,
+ enum isl_tiling *tiling_out);
+VkResult anv_device_set_bo_tiling(struct anv_device *device,
+ struct anv_bo *bo,
+ uint32_t row_pitch_B,
+ enum isl_tiling tiling);
void anv_device_release_bo(struct anv_device *device,
struct anv_bo *bo);
+static inline void anv_device_set_physical(struct anv_device *device,
+ struct anv_physical_device *physical_device)
+{
+ device->physical = physical_device;
+ device->info = &physical_device->info;
+ device->isl_dev = physical_device->isl_dev;
+}
+
static inline struct anv_bo *
anv_device_lookup_bo(struct anv_device *device, uint32_t gem_handle)
{
return util_sparse_array_get(&device->bo_cache.bo_map, gem_handle);
}
-VkResult anv_device_bo_busy(struct anv_device *device, struct anv_bo *bo);
VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo,
int64_t timeout);
VkResult anv_queue_init(struct anv_device *device, struct anv_queue *queue,
- uint32_t exec_flags,
- const VkDeviceQueueCreateInfo *pCreateInfo);
+ const VkDeviceQueueCreateInfo *pCreateInfo,
+ uint32_t index_in_family);
void anv_queue_finish(struct anv_queue *queue);
-VkResult anv_queue_execbuf_locked(struct anv_queue *queue, struct anv_queue_submit *submit);
+VkResult anv_queue_submit(struct vk_queue *queue,
+ struct vk_queue_submit *submit);
VkResult anv_queue_submit_simple_batch(struct anv_queue *queue,
- struct anv_batch *batch);
-
-uint64_t anv_gettime_ns(void);
-uint64_t anv_get_absolute_timeout(uint64_t timeout);
-
-void* anv_gem_mmap(struct anv_device *device,
- uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags);
-void anv_gem_munmap(struct anv_device *device, void *p, uint64_t size);
-uint32_t anv_gem_create(struct anv_device *device, uint64_t size);
-void anv_gem_close(struct anv_device *device, uint32_t gem_handle);
-uint32_t anv_gem_create_regions(struct anv_device *device, uint64_t anv_bo_size,
- uint32_t num_regions,
- struct drm_i915_gem_memory_class_instance *regions);
-uint32_t anv_gem_userptr(struct anv_device *device, void *mem, size_t size);
-int anv_gem_busy(struct anv_device *device, uint32_t gem_handle);
+ struct anv_batch *batch,
+ bool is_companion_rcs_batch);
+VkResult anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_batch *batch);
+
+static inline void
+anv_trtt_batch_bo_free(struct anv_device *device,
+ struct anv_trtt_batch_bo *trtt_bbo)
+{
+ anv_bo_pool_free(&device->batch_bo_pool, trtt_bbo->bo);
+ list_del(&trtt_bbo->link);
+ vk_free(&device->vk.alloc, trtt_bbo);
+}
+
+void anv_queue_trace(struct anv_queue *queue, const char *label,
+ bool frame, bool begin);
+
+static inline VkResult
+anv_queue_post_submit(struct anv_queue *queue, VkResult submit_result)
+{
+ if (submit_result != VK_SUCCESS)
+ return submit_result;
+
+ VkResult result = VK_SUCCESS;
+ if (queue->sync) {
+ result = vk_sync_wait(&queue->device->vk, queue->sync, 0,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+ if (result != VK_SUCCESS)
+ result = vk_queue_set_lost(&queue->vk, "sync wait failed");
+ }
+
+ return result;
+}
+
int anv_gem_wait(struct anv_device *device, uint32_t gem_handle, int64_t *timeout_ns);
-int anv_gem_execbuffer(struct anv_device *device,
- struct drm_i915_gem_execbuffer2 *execbuf);
int anv_gem_set_tiling(struct anv_device *device, uint32_t gem_handle,
uint32_t stride, uint32_t tiling);
-int anv_gem_create_context(struct anv_device *device);
-int anv_gem_create_context_engines(struct anv_device *device,
- const struct drm_i915_query_engine_info *info,
- int num_engines,
- uint16_t *engine_classes);
-bool anv_gem_has_context_priority(int fd);
-int anv_gem_destroy_context(struct anv_device *device, int context);
-int anv_gem_set_context_param(int fd, int context, uint32_t param,
- uint64_t value);
-int anv_gem_get_context_param(int fd, int context, uint32_t param,
- uint64_t *value);
-int anv_gem_get_param(int fd, uint32_t param);
-uint64_t anv_gem_get_drm_cap(int fd, uint32_t capability);
int anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle);
-bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling);
-int anv_gem_context_get_reset_stats(int fd, int context,
- uint32_t *active, uint32_t *pending);
int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
-int anv_gem_reg_read(int fd, uint32_t offset, uint64_t *result);
uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
-int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
-int anv_gem_set_domain(struct anv_device *device, uint32_t gem_handle,
- uint32_t read_domains, uint32_t write_domain);
-int anv_gem_sync_file_merge(struct anv_device *device, int fd1, int fd2);
-uint32_t anv_gem_syncobj_create(struct anv_device *device, uint32_t flags);
-void anv_gem_syncobj_destroy(struct anv_device *device, uint32_t handle);
-int anv_gem_syncobj_handle_to_fd(struct anv_device *device, uint32_t handle);
-uint32_t anv_gem_syncobj_fd_to_handle(struct anv_device *device, int fd);
-int anv_gem_syncobj_export_sync_file(struct anv_device *device,
- uint32_t handle);
-int anv_gem_syncobj_import_sync_file(struct anv_device *device,
- uint32_t handle, int fd);
-void anv_gem_syncobj_reset(struct anv_device *device, uint32_t handle);
-bool anv_gem_supports_syncobj_wait(int fd);
-int anv_gem_syncobj_wait(struct anv_device *device,
- const uint32_t *handles, uint32_t num_handles,
- int64_t abs_timeout_ns, bool wait_all);
-int anv_gem_syncobj_timeline_wait(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items, int64_t abs_timeout_ns,
- bool wait_all, bool wait_materialize);
-int anv_gem_syncobj_timeline_signal(struct anv_device *device,
- const uint32_t *handles, const uint64_t *points,
- uint32_t num_items);
-int anv_gem_syncobj_timeline_query(struct anv_device *device,
- const uint32_t *handles, uint64_t *points,
- uint32_t num_items);
-int anv_i915_query(int fd, uint64_t query_id, void *buffer,
- int32_t *buffer_len);
-struct drm_i915_query_engine_info *anv_gem_get_engine_info(int fd);
-int anv_gem_count_engines(const struct drm_i915_query_engine_info *info,
- uint16_t engine_class);
+int anv_gem_set_context_param(int fd, uint32_t context, uint32_t param,
+ uint64_t value);
+VkResult
+anv_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *bo_flags);
+const struct intel_device_info_pat_entry *
+anv_device_get_pat_entry(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags);
uint64_t anv_vma_alloc(struct anv_device *device,
uint64_t size, uint64_t align,
enum anv_bo_alloc_flags alloc_flags,
- uint64_t client_address);
+ uint64_t client_address,
+ struct util_vma_heap **out_vma_heap);
void anv_vma_free(struct anv_device *device,
+ struct util_vma_heap *vma_heap,
uint64_t address, uint64_t size);
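A minimal sketch (not part of this patch) of the new pairing: the heap returned through out_vma_heap by anv_vma_alloc() must be handed back to anv_vma_free(); the sizes and flags below are placeholders and the wrapper is invented.

static void
vma_roundtrip_example(struct anv_device *device)
{
   struct util_vma_heap *heap;
   uint64_t addr = anv_vma_alloc(device, 4096 /* size */, 4096 /* align */,
                                 0 /* alloc_flags */, 0 /* client_address */,
                                 &heap);
   if (addr != 0)
      anv_vma_free(device, heap, addr, 4096);
}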
struct anv_reloc_list {
- uint32_t num_relocs;
- uint32_t array_length;
- struct drm_i915_gem_relocation_entry * relocs;
- struct anv_bo ** reloc_bos;
+ bool uses_relocs;
uint32_t dep_words;
BITSET_WORD * deps;
+ const VkAllocationCallbacks *alloc;
};
VkResult anv_reloc_list_init(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc);
-void anv_reloc_list_finish(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc);
+ const VkAllocationCallbacks *alloc,
+ bool uses_relocs);
+void anv_reloc_list_finish(struct anv_reloc_list *list);
+
+VkResult
+anv_reloc_list_add_bo_impl(struct anv_reloc_list *list, struct anv_bo *target_bo);
-VkResult anv_reloc_list_add(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- uint32_t offset, struct anv_bo *target_bo,
- uint32_t delta, uint64_t *address_u64_out);
+static inline VkResult
+anv_reloc_list_add_bo(struct anv_reloc_list *list, struct anv_bo *target_bo)
+{
+ return list->uses_relocs ? anv_reloc_list_add_bo_impl(list, target_bo) : VK_SUCCESS;
+}
-VkResult anv_reloc_list_add_bo(struct anv_reloc_list *list,
- const VkAllocationCallbacks *alloc,
- struct anv_bo *target_bo);
+VkResult anv_reloc_list_append(struct anv_reloc_list *list,
+ struct anv_reloc_list *other);
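Illustrative usage of the simplified relocation API above (not part of this patch); whether relocations are used at all now comes from the physical device, and the wrapper function is invented.

static VkResult
reloc_list_example(struct anv_device *device, struct anv_reloc_list *relocs,
                   struct anv_bo *bo)
{
   VkResult result = anv_reloc_list_init(relocs, &device->vk.alloc,
                                         device->physical->uses_relocs);
   if (result != VK_SUCCESS)
      return result;

   /* A no-op on platforms without relocations, records a BO dependency
    * otherwise.
    */
   return anv_reloc_list_add_bo(relocs, bo);
}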
struct anv_batch_bo {
/* Link in the anv_cmd_buffer.owned_batch_bos list */
@@ -1572,6 +2286,12 @@ struct anv_batch_bo {
struct anv_batch {
const VkAllocationCallbacks * alloc;
+ /**
+ * Sum of all the anv_batch_bo sizes allocated for this command buffer.
+ * Used to increase allocation size for long command buffers.
+ */
+ size_t allocated_batch_size;
+
struct anv_address start_addr;
void * start;
@@ -1583,7 +2303,7 @@ struct anv_batch {
/* This callback is called (with the associated user data) in the event
* that the batch runs out of space.
*/
- VkResult (*extend_cb)(struct anv_batch *, void *);
+ VkResult (*extend_cb)(struct anv_batch *, uint32_t, void *);
void * user_data;
/**
@@ -1594,12 +2314,27 @@ struct anv_batch {
* of the driver.
*/
VkResult status;
+
+ enum intel_engine_class engine_class;
+
+ /**
+ * Number of 3DPRIMITIVEs emitted for WA 16014538804
+ */
+ uint8_t num_3d_primitives_emitted;
};
void *anv_batch_emit_dwords(struct anv_batch *batch, int num_dwords);
+VkResult anv_batch_emit_ensure_space(struct anv_batch *batch, uint32_t size);
+void anv_batch_advance(struct anv_batch *batch, uint32_t size);
void anv_batch_emit_batch(struct anv_batch *batch, struct anv_batch *other);
struct anv_address anv_batch_address(struct anv_batch *batch, void *batch_location);
+static inline struct anv_address
+anv_batch_current_address(struct anv_batch *batch)
+{
+ return anv_batch_address(batch, batch->next);
+}
+
static inline void
anv_batch_set_storage(struct anv_batch *batch, struct anv_address addr,
void *map, size_t size)
@@ -1625,97 +2360,16 @@ anv_batch_has_error(struct anv_batch *batch)
}
static inline uint64_t
-anv_batch_emit_reloc(struct anv_batch *batch,
- void *location, struct anv_bo *bo, uint32_t delta)
-{
- uint64_t address_u64 = 0;
- VkResult result;
-
- if (ANV_ALWAYS_SOFTPIN) {
- address_u64 = bo->offset + delta;
- result = anv_reloc_list_add_bo(batch->relocs, batch->alloc, bo);
- } else {
- result = anv_reloc_list_add(batch->relocs, batch->alloc,
- location - batch->start, bo, delta,
- &address_u64);
- }
- if (unlikely(result != VK_SUCCESS)) {
- anv_batch_set_error(batch, result);
- return 0;
- }
-
- return address_u64;
-}
-
-
-#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 })
-
-static inline struct anv_address
-anv_address_from_u64(uint64_t addr_u64)
-{
- assert(addr_u64 == intel_canonical_address(addr_u64));
- return (struct anv_address) {
- .bo = NULL,
- .offset = addr_u64,
- };
-}
-
-static inline bool
-anv_address_is_null(struct anv_address addr)
-{
- return addr.bo == NULL && addr.offset == 0;
-}
-
-static inline uint64_t
-anv_address_physical(struct anv_address addr)
-{
- if (addr.bo && (ANV_ALWAYS_SOFTPIN ||
- (addr.bo->flags & EXEC_OBJECT_PINNED))) {
- assert(addr.bo->flags & EXEC_OBJECT_PINNED);
- return intel_canonical_address(addr.bo->offset + addr.offset);
- } else {
- return intel_canonical_address(addr.offset);
- }
-}
-
-static inline struct anv_address
-anv_address_add(struct anv_address addr, uint64_t offset)
-{
- addr.offset += offset;
- return addr;
-}
-
-static inline void
-write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
-{
- unsigned reloc_size = 0;
- if (device->info.ver >= 8) {
- reloc_size = sizeof(uint64_t);
- *(uint64_t *)p = intel_canonical_address(v);
- } else {
- reloc_size = sizeof(uint32_t);
- *(uint32_t *)p = v;
- }
-
- if (flush && !device->info.has_llc)
- intel_flush_range(p, reloc_size);
-}
-
-static inline uint64_t
_anv_combine_address(struct anv_batch *batch, void *location,
const struct anv_address address, uint32_t delta)
{
- if (address.bo == NULL) {
+ if (address.bo == NULL)
return address.offset + delta;
- } else if (batch == NULL) {
- assert(address.bo->flags & EXEC_OBJECT_PINNED);
- return anv_address_physical(anv_address_add(address, delta));
- } else {
- assert(batch->start <= location && location < batch->end);
- /* i915 relocations are signed. */
- assert(INT32_MIN <= address.offset && address.offset <= INT32_MAX);
- return anv_batch_emit_reloc(batch, location, address.bo, address.offset + delta);
- }
+
+ if (batch)
+ anv_reloc_list_add_bo(batch->relocs, address.bo);
+
+ return anv_address_physical(anv_address_add(address, delta));
}
#define __gen_address_type struct anv_address
@@ -1755,18 +2409,20 @@ _anv_combine_address(struct anv_batch *batch, void *location,
__dst; \
})
-#define anv_batch_emit_merge(batch, dwords0, dwords1) \
- do { \
- uint32_t *dw; \
- \
- STATIC_ASSERT(ARRAY_SIZE(dwords0) == ARRAY_SIZE(dwords1)); \
- dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0)); \
- if (!dw) \
- break; \
- for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++) \
- dw[i] = (dwords0)[i] | (dwords1)[i]; \
- VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4));\
- } while (0)
+#define anv_batch_emit_merge(batch, cmd, pipeline, state, name) \
+ for (struct cmd name = { 0 }, \
+ *_dst = anv_batch_emit_dwords(batch, __anv_cmd_length(cmd)); \
+ __builtin_expect(_dst != NULL, 1); \
+ ({ uint32_t _partial[__anv_cmd_length(cmd)]; \
+ assert((pipeline)->state.len == __anv_cmd_length(cmd)); \
+ __anv_cmd_pack(cmd)(batch, _partial, &name); \
+ for (uint32_t i = 0; i < __anv_cmd_length(cmd); i++) { \
+ ((uint32_t *)_dst)[i] = _partial[i] | \
+ (pipeline)->batch_data[(pipeline)->state.offset + i]; \
+ } \
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
+ _dst = NULL; \
+ }))
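For illustration, the merge macro above is meant to be used like anv_batch_emit(), with the pipeline's pre-packed dwords OR'ed into the dynamically packed ones. A minimal sketch under assumed names (pipeline->final.sf and the line-width value are placeholders, not taken from this patch):

   /* Pack the dynamic fields of 3DSTATE_SF and merge them with the dwords the
    * pipeline pre-packed at pipeline->final.sf.offset.
    */
   anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
                        pipeline, final.sf, sf) {
      sf.LineWidth = 1.0f; /* dynamic value, placeholder */
   }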
#define anv_batch_emit(batch, cmd, name) \
for (struct cmd name = { __anv_cmd_header(cmd) }, \
@@ -1797,22 +2453,18 @@ _anv_combine_address(struct anv_batch *batch, void *location,
/* #define __gen_address_offset anv_address_add */
struct anv_device_memory {
- struct vk_object_base base;
+ struct vk_device_memory vk;
struct list_head link;
struct anv_bo * bo;
const struct anv_memory_type * type;
- VkDeviceSize map_size;
- void * map;
- /* If set, we are holding reference to AHardwareBuffer
- * which we must release when memory is freed.
- */
- struct AHardwareBuffer * ahw;
+ void * map;
+ size_t map_size;
- /* If set, this memory comes from a host pointer. */
- void * host_ptr;
+ /* The map, from the user PoV is map + map_delta */
+ uint64_t map_delta;
};
/**
@@ -1846,17 +2498,6 @@ struct anv_sampled_image_descriptor {
uint32_t sampler;
};
-struct anv_texture_swizzle_descriptor {
- /** Texture swizzle
- *
- * See also nir_intrinsic_channel_select_intel
- */
- uint8_t swizzle[4];
-
- /** Unused padding to ensure the struct is a multiple of 64 bits */
- uint32_t _pad;
-};
-
/** Struct representing a storage image descriptor */
struct anv_storage_image_descriptor {
/** Bindless image handles
@@ -1864,8 +2505,29 @@ struct anv_storage_image_descriptor {
* These are expected to already be shifted such that the 20-bit
* SURFACE_STATE table index is in the top 20 bits.
*/
- uint32_t read_write;
- uint32_t write_only;
+ uint32_t vanilla;
+
+ /** Image depth
+ *
+ * By default the HW RESINFO message allows us to query the depth of an image:
+ *
+ * From the Kaby Lake docs for the RESINFO message:
+ *
+ * "Surface Type | ... | Blue
+ * --------------+-----+----------------
+ * SURFTYPE_3D | ... | (Depth+1)>>LOD"
+ *
+ * With VK_EXT_sliced_view_of_3d, we have to support a slice of a 3D image,
+ * meaning at a depth offset with a new depth value potentially reduced
+ * from the original image. Unfortunately if we change the Depth value of
+ * the image, we then run into issues with Yf/Ys tilings where the HW fetches
+ * data at incorrect locations.
+ *
+ * To solve this, we put the slice depth in the descriptor and recompose
+ * the vec3 (width, height, depth) using this field for z and xy using the
+ * RESINFO result.
+ */
+ uint32_t image_depth;
};
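The recomposition described in the comment above amounts to taking xy from the RESINFO return and z from the descriptor. A minimal sketch with hypothetical names for the lowered values (resinfo_ret and desc are not driver symbols):

   /* Size query for a sliced 3D storage image: width/height come from the HW
    * RESINFO result, depth comes from the value the driver wrote in the
    * descriptor.
    */
   uint32_t size_x = resinfo_ret[0];
   uint32_t size_y = resinfo_ret[1];
   uint32_t size_z = desc->image_depth;   /* instead of (Depth+1)>>LOD */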
/** Struct representing a address/range descriptor
@@ -1883,23 +2545,25 @@ struct anv_address_range_descriptor {
enum anv_descriptor_data {
/** The descriptor contains a BTI reference to a surface state */
- ANV_DESCRIPTOR_SURFACE_STATE = (1 << 0),
+ ANV_DESCRIPTOR_BTI_SURFACE_STATE = BITFIELD_BIT(0),
/** The descriptor contains a BTI reference to a sampler state */
- ANV_DESCRIPTOR_SAMPLER_STATE = (1 << 1),
+ ANV_DESCRIPTOR_BTI_SAMPLER_STATE = BITFIELD_BIT(1),
/** The descriptor contains an actual buffer view */
- ANV_DESCRIPTOR_BUFFER_VIEW = (1 << 2),
- /** The descriptor contains auxiliary image layout data */
- ANV_DESCRIPTOR_IMAGE_PARAM = (1 << 3),
- /** The descriptor contains auxiliary image layout data */
- ANV_DESCRIPTOR_INLINE_UNIFORM = (1 << 4),
+ ANV_DESCRIPTOR_BUFFER_VIEW = BITFIELD_BIT(2),
+ /** The descriptor contains inline uniform data */
+ ANV_DESCRIPTOR_INLINE_UNIFORM = BITFIELD_BIT(3),
/** anv_address_range_descriptor with a buffer address and range */
- ANV_DESCRIPTOR_ADDRESS_RANGE = (1 << 5),
- /** Bindless surface handle */
- ANV_DESCRIPTOR_SAMPLED_IMAGE = (1 << 6),
- /** Storage image handles */
- ANV_DESCRIPTOR_STORAGE_IMAGE = (1 << 7),
- /** Storage image handles */
- ANV_DESCRIPTOR_TEXTURE_SWIZZLE = (1 << 8),
+ ANV_DESCRIPTOR_INDIRECT_ADDRESS_RANGE = BITFIELD_BIT(4),
+ /** Bindless surface handle (through anv_sampled_image_descriptor) */
+ ANV_DESCRIPTOR_INDIRECT_SAMPLED_IMAGE = BITFIELD_BIT(5),
+ /** Storage image handles (through anv_storage_image_descriptor) */
+ ANV_DESCRIPTOR_INDIRECT_STORAGE_IMAGE = BITFIELD_BIT(6),
+ /** The descriptor contains a single RENDER_SURFACE_STATE */
+ ANV_DESCRIPTOR_SURFACE = BITFIELD_BIT(7),
+ /** The descriptor contains a SAMPLER_STATE */
+ ANV_DESCRIPTOR_SAMPLER = BITFIELD_BIT(8),
+ /** A tuple of RENDER_SURFACE_STATE & SAMPLER_STATE */
+ ANV_DESCRIPTOR_SURFACE_SAMPLER = BITFIELD_BIT(9),
};
struct anv_descriptor_set_binding_layout {
@@ -1907,7 +2571,7 @@ struct anv_descriptor_set_binding_layout {
VkDescriptorType type;
/* Flags provided when this binding was created */
- VkDescriptorBindingFlagsEXT flags;
+ VkDescriptorBindingFlags flags;
/* Bitfield representing the type of data this descriptor contains */
enum anv_descriptor_data data;
@@ -1920,38 +2584,58 @@ struct anv_descriptor_set_binding_layout {
*/
uint32_t array_size;
- /* Index into the flattend descriptor set */
+ /* Index into the flattened descriptor set */
uint32_t descriptor_index;
- /* Index into the dynamic state array for a dynamic buffer */
+ /* Index into the dynamic state array for a dynamic buffer, relative to the
+ * set.
+ */
int16_t dynamic_offset_index;
+ /* Computed surface size from data (for one plane) */
+ uint16_t descriptor_data_surface_size;
+
+ /* Computed sampler size from data (for one plane) */
+ uint16_t descriptor_data_sampler_size;
+
/* Index into the descriptor set buffer views */
int32_t buffer_view_index;
- /* Offset into the descriptor buffer where this descriptor lives */
- uint32_t descriptor_offset;
+ /* Offset into the descriptor buffer where the surface descriptor lives */
+ uint32_t descriptor_surface_offset;
- /* Immutable samplers (or NULL if no immutable samplers) */
- struct anv_sampler **immutable_samplers;
-};
+ /* Offset into the descriptor buffer where the sampler descriptor lives */
+ uint16_t descriptor_sampler_offset;
-unsigned anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout);
+ /* Precomputed surface stride (with a multiplane descriptor, the descriptor
+ * includes all the planes)
+ */
+ uint16_t descriptor_surface_stride;
-unsigned anv_descriptor_type_size(const struct anv_physical_device *pdevice,
- VkDescriptorType type);
+ /* Precomputed sampler stride (with a multiplane descriptor, the descriptor
+ * includes all the planes)
+ */
+ uint16_t descriptor_sampler_stride;
-bool anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice,
- const struct anv_descriptor_set_binding_layout *binding,
- bool sampler);
+ /* Immutable samplers (or NULL if no immutable samplers) */
+ struct anv_sampler **immutable_samplers;
+};
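As a sketch of how the per-binding offsets and strides above could be combined to locate array element i (the example_ helper is hypothetical, not a driver function):

   /* Byte offset of element i of a binding inside the set's surface
    * descriptor buffer.
    */
   static inline uint32_t
   example_surface_desc_offset(const struct anv_descriptor_set_binding_layout *bind,
                               uint32_t i)
   {
      return bind->descriptor_surface_offset + i * bind->descriptor_surface_stride;
   }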
-bool anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice,
- const struct anv_descriptor_set_binding_layout *binding,
- bool sampler);
+enum anv_descriptor_set_layout_type {
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_UNKNOWN,
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT,
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT,
+ ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER,
+};
struct anv_descriptor_set_layout {
struct vk_object_base base;
+ VkDescriptorSetLayoutCreateFlags flags;
+
+ /* Type of descriptor set layout */
+ enum anv_descriptor_set_layout_type type;
+
/* Descriptor set layouts can be destroyed at almost any time */
uint32_t ref_cnt;
@@ -1975,21 +2659,43 @@ struct anv_descriptor_set_layout {
*/
VkShaderStageFlags dynamic_offset_stages[MAX_DYNAMIC_BUFFERS];
- /* Size of the descriptor buffer for this descriptor set */
- uint32_t descriptor_buffer_size;
+ /* Size of the descriptor buffer dedicated to surface states for this
+ * descriptor set
+ */
+ uint32_t descriptor_buffer_surface_size;
+
+ /* Size of the descriptor buffer dedicated to sampler states for this
+ * descriptor set
+ */
+ uint32_t descriptor_buffer_sampler_size;
+
+ /* Number of embedded samplers */
+ uint32_t embedded_sampler_count;
/* Bindings in this descriptor set */
struct anv_descriptor_set_binding_layout binding[0];
};
+bool anv_descriptor_supports_bindless(const struct anv_physical_device *pdevice,
+ const struct anv_descriptor_set_layout *set,
+ const struct anv_descriptor_set_binding_layout *binding);
+
+bool anv_descriptor_requires_bindless(const struct anv_physical_device *pdevice,
+ const struct anv_descriptor_set_layout *set,
+ const struct anv_descriptor_set_binding_layout *binding);
+
void anv_descriptor_set_layout_destroy(struct anv_device *device,
struct anv_descriptor_set_layout *layout);
-static inline void
+void anv_descriptor_set_layout_print(const struct anv_descriptor_set_layout *layout);
+
+static inline struct anv_descriptor_set_layout *
anv_descriptor_set_layout_ref(struct anv_descriptor_set_layout *layout)
{
assert(layout && layout->ref_cnt >= 1);
p_atomic_inc(&layout->ref_cnt);
+
+ return layout;
}
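Returning the layout pointer lets callers take a reference and assign in a single statement; a hedged usage sketch (caller names assumed):

   /* Reference the layout while storing it in the set. */
   set->layout = anv_descriptor_set_layout_ref(set_layout);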
static inline void
@@ -2012,12 +2718,16 @@ struct anv_descriptor {
};
struct {
+ struct anv_buffer_view *set_buffer_view;
struct anv_buffer *buffer;
uint64_t offset;
uint64_t range;
+ uint64_t bind_range;
};
struct anv_buffer_view *buffer_view;
+
+ struct vk_acceleration_structure *accel_struct;
};
};
@@ -2032,13 +2742,36 @@ struct anv_descriptor_set {
*/
uint32_t size;
- /* State relative to anv_descriptor_pool::bo */
- struct anv_state desc_mem;
+ /* Is this descriptor set a push descriptor */
+ bool is_push;
+
+ /* Bitfield of descriptors for which we need to generate surface states.
+ * Only valid for push descriptors
+ */
+ uint32_t generate_surface_states;
+
+ /* State relative to anv_descriptor_pool::surface_bo */
+ struct anv_state desc_surface_mem;
+ /* State relative to anv_descriptor_pool::sampler_bo */
+ struct anv_state desc_sampler_mem;
/* Surface state for the descriptor buffer */
struct anv_state desc_surface_state;
- /* Descriptor set address. */
- struct anv_address desc_addr;
+ /* Descriptor set address pointing to desc_surface_mem (we don't need one
+ * for samplers because they're only ever accessed by the HW through the
+ * shader sampler handle).
+ */
+ struct anv_address desc_surface_addr;
+
+ struct anv_address desc_sampler_addr;
+
+ /* Descriptor offset from the
+ * device->va.internal_surface_state_pool.addr
+ *
+ * It just needs to be added to the binding table offset to be put into the
+ * HW BTI entry.
+ */
+ uint32_t desc_offset;
uint32_t buffer_view_count;
struct anv_buffer_view *buffer_views;
@@ -2056,19 +2789,31 @@ anv_descriptor_set_is_push(struct anv_descriptor_set *set)
return set->pool == NULL;
}
-struct anv_buffer_view {
- struct vk_object_base base;
+struct anv_surface_state_data {
+ uint8_t data[ANV_SURFACE_STATE_SIZE];
+};
- enum isl_format format; /**< VkBufferViewCreateInfo::format */
- uint64_t range; /**< VkBufferViewCreateInfo::range */
+struct anv_buffer_state {
+ /** Surface state allocated from the bindless heap
+ *
+ * Only valid if anv_physical_device::indirect_descriptors is true
+ */
+ struct anv_state state;
- struct anv_address address;
+ /** Surface state after genxml packing
+ *
+ * Only valid if anv_physical_device::indirect_descriptors is false
+ */
+ struct anv_surface_state_data state_data;
+};
+
+struct anv_buffer_view {
+ struct vk_buffer_view vk;
- struct anv_state surface_state;
- struct anv_state storage_surface_state;
- struct anv_state writeonly_storage_surface_state;
+ struct anv_address address;
- struct brw_image_param storage_image_param;
+ struct anv_buffer_state general;
+ struct anv_buffer_state storage;
};
struct anv_push_descriptor_set {
@@ -2098,78 +2843,57 @@ anv_descriptor_set_address(struct anv_descriptor_set *set)
push_set->set_used_on_gpu = true;
}
- return set->desc_addr;
+ return set->desc_surface_addr;
}
-struct anv_descriptor_pool {
- struct vk_object_base base;
-
- uint32_t size;
- uint32_t next;
- uint32_t free_list;
+struct anv_descriptor_pool_heap {
+ /* BO allocated to back the pool (unused for host pools) */
+ struct anv_bo *bo;
- struct anv_bo *bo;
- struct util_vma_heap bo_heap;
+ /* Host memory allocated to back a host pool */
+ void *host_mem;
- struct anv_state_stream surface_state_stream;
- void *surface_state_free_list;
+ /* Heap tracking allocations in bo/host_mem */
+ struct util_vma_heap heap;
- struct list_head desc_sets;
-
- char data[0];
-};
-
-enum anv_descriptor_template_entry_type {
- ANV_DESCRIPTOR_TEMPLATE_ENTRY_TYPE_IMAGE,
- ANV_DESCRIPTOR_TEMPLATE_ENTRY_TYPE_BUFFER,
- ANV_DESCRIPTOR_TEMPLATE_ENTRY_TYPE_BUFFER_VIEW
+ /* Size of the heap */
+ uint32_t size;
};
-struct anv_descriptor_template_entry {
- /* The type of descriptor in this entry */
- VkDescriptorType type;
-
- /* Binding in the descriptor set */
- uint32_t binding;
-
- /* Offset at which to write into the descriptor set binding */
- uint32_t array_element;
+struct anv_descriptor_pool {
+ struct vk_object_base base;
- /* Number of elements to write into the descriptor set binding */
- uint32_t array_count;
+ struct anv_descriptor_pool_heap surfaces;
+ struct anv_descriptor_pool_heap samplers;
- /* Offset into the user provided data */
- size_t offset;
+ struct anv_state_stream surface_state_stream;
+ void *surface_state_free_list;
- /* Stride between elements into the user provided data */
- size_t stride;
-};
+ /** List of anv_descriptor_set. */
+ struct list_head desc_sets;
-struct anv_descriptor_update_template {
- struct vk_object_base base;
+ /** Heap over host_mem */
+ struct util_vma_heap host_heap;
- VkPipelineBindPoint bind_point;
+ /** Allocated size of host_mem */
+ uint32_t host_mem_size;
- /* The descriptor set this template corresponds to. This value is only
- * valid if the template was created with the templateType
- * VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET.
+ /**
+ * VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_EXT. If set, then
+ * surface_state_stream is unused.
*/
- uint8_t set;
+ bool host_only;
- /* Number of entries in this template */
- uint32_t entry_count;
-
- /* Entries of the template */
- struct anv_descriptor_template_entry entries[0];
+ char host_mem[0];
};
-size_t
-anv_descriptor_set_layout_size(const struct anv_descriptor_set_layout *layout,
- uint32_t var_desc_count);
+bool
+anv_push_descriptor_set_init(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_push_descriptor_set *push_set,
+ struct anv_descriptor_set_layout *layout);
-uint32_t
-anv_descriptor_set_layout_descriptor_buffer_size(const struct anv_descriptor_set_layout *set_layout,
- uint32_t var_desc_count);
+void
+anv_push_descriptor_set_finish(struct anv_push_descriptor_set *push_set);
void
anv_descriptor_set_write_image_view(struct anv_device *device,
@@ -2190,7 +2914,6 @@ anv_descriptor_set_write_buffer_view(struct anv_device *device,
void
anv_descriptor_set_write_buffer(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_state_stream *alloc_stream,
VkDescriptorType type,
struct anv_buffer *buffer,
uint32_t binding,
@@ -2199,9 +2922,14 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
VkDeviceSize range);
void
+anv_descriptor_write_surface_state(struct anv_device *device,
+ struct anv_descriptor *desc,
+ struct anv_state surface_state);
+
+void
anv_descriptor_set_write_acceleration_structure(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_acceleration_structure *accel,
+ struct vk_acceleration_structure *accel,
uint32_t binding,
uint32_t element);
@@ -2214,30 +2942,23 @@ anv_descriptor_set_write_inline_uniform_data(struct anv_device *device,
size_t size);
void
+anv_descriptor_set_write(struct anv_device *device,
+ struct anv_descriptor_set *set_override,
+ uint32_t write_count,
+ const VkWriteDescriptorSet *writes);
+
+void
anv_descriptor_set_write_template(struct anv_device *device,
struct anv_descriptor_set *set,
- struct anv_state_stream *alloc_stream,
- const struct anv_descriptor_update_template *template,
+ const struct vk_descriptor_update_template *template,
const void *data);
-VkResult
-anv_descriptor_set_create(struct anv_device *device,
- struct anv_descriptor_pool *pool,
- struct anv_descriptor_set_layout *layout,
- uint32_t var_desc_count,
- struct anv_descriptor_set **out_set);
-
-void
-anv_descriptor_set_destroy(struct anv_device *device,
- struct anv_descriptor_pool *pool,
- struct anv_descriptor_set *set);
-
-#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 5)
-#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 4)
-#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3)
-#define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2)
-#define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1)
-#define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX
+#define ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER (UINT8_MAX - 5)
+#define ANV_DESCRIPTOR_SET_NULL (UINT8_MAX - 4)
+#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS (UINT8_MAX - 3)
+#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 2)
+#define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 1)
+#define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX
struct anv_pipeline_binding {
/** Index in the descriptor set
@@ -2247,6 +2968,19 @@ struct anv_pipeline_binding {
*/
uint32_t index;
+ /** Binding in the descriptor set. Not valid for any of the
+ * ANV_DESCRIPTOR_SET_*
+ */
+ uint32_t binding;
+
+ /** Offset in the descriptor buffer
+ *
+ * Relative to anv_descriptor_set::desc_surface_addr. This is useful for
+ * ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT, to generate the binding
+ * table entry.
+ */
+ uint32_t set_offset;
+
/** The descriptor set this surface corresponds to.
*
* The special ANV_DESCRIPTOR_SET_* values above indicates that this
@@ -2261,17 +2995,39 @@ struct anv_pipeline_binding {
/** Input attachment index (relative to the subpass) */
uint8_t input_attachment_index;
- /** Dynamic offset index (for dynamic UBOs and SSBOs) */
+ /** Dynamic offset index
+ *
+ * For dynamic UBOs and SSBOs, relative to set.
+ */
uint8_t dynamic_offset_index;
};
+};
- /** For a storage image, whether it is write-only */
- uint8_t write_only;
-
- /** Pad to 64 bits so that there are no holes and we can safely memcmp
- * assuming POD zero-initialization.
+struct anv_embedded_sampler_key {
+ /** No need to track binding elements for embedded samplers as :
+ *
+ * VUID-VkDescriptorSetLayoutBinding-flags-08006:
+ *
+ * "If VkDescriptorSetLayoutCreateInfo:flags contains
+ * VK_DESCRIPTOR_SET_LAYOUT_CREATE_EMBEDDED_IMMUTABLE_SAMPLERS_BIT_EXT,
+ * descriptorCount must: less than or equal to 1"
+ *
+ * The following struct can be safely hash as it doesn't include in
+ * address/offset.
*/
- uint8_t pad;
+ uint32_t sampler[4];
+ uint32_t color[4];
+};
+
+struct anv_pipeline_embedded_sampler_binding {
+ /** The descriptor set this sampler belongs to */
+ uint8_t set;
+
+ /** The binding in the set this sampler belongs to */
+ uint32_t binding;
+
+ /** The data configuring the sampler */
+ struct anv_embedded_sampler_key key;
};
struct anv_push_range {
@@ -2281,7 +3037,7 @@ struct anv_push_range {
/** Descriptor set index */
uint8_t set;
- /** Dynamic offset index (for dynamic UBOs) */
+ /** Dynamic offset index (for dynamic UBOs), relative to set. */
uint8_t dynamic_offset_index;
/** Start offset in units of 32B */
@@ -2291,175 +3047,153 @@ struct anv_push_range {
uint8_t length;
};
-struct anv_pipeline_layout {
- struct vk_object_base base;
+struct anv_pipeline_sets_layout {
+ struct anv_device *device;
struct {
struct anv_descriptor_set_layout *layout;
uint32_t dynamic_offset_start;
} set[MAX_SETS];
+ enum anv_descriptor_set_layout_type type;
+
uint32_t num_sets;
+ uint32_t num_dynamic_buffers;
+ int push_descriptor_set_index;
+
+ bool independent_sets;
unsigned char sha1[20];
};
-struct anv_buffer {
- struct vk_object_base base;
+void anv_pipeline_sets_layout_init(struct anv_pipeline_sets_layout *layout,
+ struct anv_device *device,
+ bool independent_sets);
- struct anv_device * device;
- VkDeviceSize size;
+void anv_pipeline_sets_layout_fini(struct anv_pipeline_sets_layout *layout);
- VkBufferCreateFlags create_flags;
- VkBufferUsageFlags usage;
+void anv_pipeline_sets_layout_add(struct anv_pipeline_sets_layout *layout,
+ uint32_t set_idx,
+ struct anv_descriptor_set_layout *set_layout);
- /* Set when bound */
- struct anv_address address;
+uint32_t
+anv_pipeline_sets_layout_embedded_sampler_count(const struct anv_pipeline_sets_layout *layout);
+
+void anv_pipeline_sets_layout_hash(struct anv_pipeline_sets_layout *layout);
+
+void anv_pipeline_sets_layout_print(const struct anv_pipeline_sets_layout *layout);
+
+struct anv_pipeline_layout {
+ struct vk_object_base base;
+
+ struct anv_pipeline_sets_layout sets_layout;
};
-static inline uint64_t
-anv_buffer_get_range(struct anv_buffer *buffer, uint64_t offset, uint64_t range)
+const struct anv_descriptor_set_layout *
+anv_pipeline_layout_get_push_set(const struct anv_pipeline_sets_layout *layout,
+ uint8_t *desc_idx);
+
+struct anv_sparse_binding_data {
+ uint64_t address;
+ uint64_t size;
+
+ /* This is kept only because it's given to us by vma_alloc() and needs to be
+ * passed back to vma_free(); we have no other particular use for it
+ */
+ struct util_vma_heap *vma_heap;
+};
+
+#define ANV_SPARSE_BLOCK_SIZE (64 * 1024)
+
+static inline bool
+anv_sparse_binding_is_enabled(struct anv_device *device)
{
- assert(offset <= buffer->size);
- if (range == VK_WHOLE_SIZE) {
- return buffer->size - offset;
- } else {
- assert(range + offset >= range);
- assert(range + offset <= buffer->size);
- return range;
- }
+ return device->vk.enabled_features.sparseBinding;
}
-enum anv_cmd_dirty_bits {
- ANV_CMD_DIRTY_DYNAMIC_VIEWPORT = 1 << 0, /* VK_DYNAMIC_STATE_VIEWPORT */
- ANV_CMD_DIRTY_DYNAMIC_SCISSOR = 1 << 1, /* VK_DYNAMIC_STATE_SCISSOR */
- ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 2, /* VK_DYNAMIC_STATE_LINE_WIDTH */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS = 1 << 3, /* VK_DYNAMIC_STATE_DEPTH_BIAS */
- ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS = 1 << 4, /* VK_DYNAMIC_STATE_BLEND_CONSTANTS */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS = 1 << 5, /* VK_DYNAMIC_STATE_DEPTH_BOUNDS */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 6, /* VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, /* VK_DYNAMIC_STATE_STENCIL_WRITE_MASK */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 8, /* VK_DYNAMIC_STATE_STENCIL_REFERENCE */
- ANV_CMD_DIRTY_PIPELINE = 1 << 9,
- ANV_CMD_DIRTY_INDEX_BUFFER = 1 << 10,
- ANV_CMD_DIRTY_RENDER_TARGETS = 1 << 11,
- ANV_CMD_DIRTY_XFB_ENABLE = 1 << 12,
- ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE = 1 << 13, /* VK_DYNAMIC_STATE_LINE_STIPPLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE = 1 << 14, /* VK_DYNAMIC_STATE_CULL_MODE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE = 1 << 15, /* VK_DYNAMIC_STATE_FRONT_FACE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY = 1 << 16, /* VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT */
- ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE = 1 << 17, /* VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE = 1 << 18, /* VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE = 1 << 19, /* VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP = 1 << 20, /* VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE = 1 << 21, /* VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE = 1 << 22, /* VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP = 1 << 23, /* VK_DYNAMIC_STATE_STENCIL_OP_EXT */
- ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS = 1 << 24, /* VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT */
- ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE = 1 << 25, /* VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE = 1 << 26, /* VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR */
- ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE = 1 << 27, /* VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE = 1 << 28, /* VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT */
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP = 1 << 29, /* VK_DYNAMIC_STATE_LOGIC_OP_EXT */
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE = 1 << 30, /* VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT */
-};
-typedef uint32_t anv_cmd_dirty_mask_t;
-
-#define ANV_CMD_DIRTY_DYNAMIC_ALL \
- (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT | \
- ANV_CMD_DIRTY_DYNAMIC_SCISSOR | \
- ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS | \
- ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | \
- ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE | \
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE | \
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE | \
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY | \
- ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP | \
- ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS | \
- ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE | \
- ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE | \
- ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE | \
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP | \
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
-
-static inline enum anv_cmd_dirty_bits
-anv_cmd_dirty_bit_for_vk_dynamic_state(VkDynamicState vk_state)
+static inline bool
+anv_sparse_residency_is_enabled(struct anv_device *device)
{
- switch (vk_state) {
- case VK_DYNAMIC_STATE_VIEWPORT:
- case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_VIEWPORT;
- case VK_DYNAMIC_STATE_SCISSOR:
- case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_SCISSOR;
- case VK_DYNAMIC_STATE_LINE_WIDTH:
- return ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
- case VK_DYNAMIC_STATE_DEPTH_BIAS:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
- case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
- return ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
- case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
- case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
- case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
- case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
- case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
- case VK_DYNAMIC_STATE_CULL_MODE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_CULL_MODE;
- case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
- case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
- case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
- case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
- case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
- case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
- case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
- case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
- case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
- case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
- case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
- case VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR:
- return ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE;
- case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
- case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
- case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
- case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
- return ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
- default:
- assert(!"Unsupported dynamic state");
- return 0;
- }
+ return device->vk.enabled_features.sparseResidencyBuffer ||
+ device->vk.enabled_features.sparseResidencyImage2D ||
+ device->vk.enabled_features.sparseResidencyImage3D ||
+ device->vk.enabled_features.sparseResidency2Samples ||
+ device->vk.enabled_features.sparseResidency4Samples ||
+ device->vk.enabled_features.sparseResidency8Samples ||
+ device->vk.enabled_features.sparseResidency16Samples ||
+ device->vk.enabled_features.sparseResidencyAliased;
}
+VkResult anv_init_sparse_bindings(struct anv_device *device,
+ uint64_t size,
+ struct anv_sparse_binding_data *sparse,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t client_address,
+ struct anv_address *out_address);
+void anv_free_sparse_bindings(struct anv_device *device,
+ struct anv_sparse_binding_data *sparse);
+VkResult anv_sparse_bind_buffer(struct anv_device *device,
+ struct anv_buffer *buffer,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit);
+VkResult anv_sparse_bind_image_opaque(struct anv_device *device,
+ struct anv_image *image,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit);
+VkResult anv_sparse_bind_image_memory(struct anv_queue *queue,
+ struct anv_image *image,
+ const VkSparseImageMemoryBind *bind,
+ struct anv_sparse_submission *submit);
+VkResult anv_sparse_bind(struct anv_device *device,
+ struct anv_sparse_submission *sparse_submit);
+
+VkSparseImageFormatProperties
+anv_sparse_calc_image_format_properties(struct anv_physical_device *pdevice,
+ VkImageAspectFlags aspect,
+ VkImageType vk_image_type,
+ struct isl_surf *surf);
+void anv_sparse_calc_miptail_properties(struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags vk_aspect,
+ uint32_t *imageMipTailFirstLod,
+ VkDeviceSize *imageMipTailSize,
+ VkDeviceSize *imageMipTailOffset,
+ VkDeviceSize *imageMipTailStride);
+VkResult anv_sparse_image_check_support(struct anv_physical_device *pdevice,
+ VkImageCreateFlags flags,
+ VkImageTiling tiling,
+ VkSampleCountFlagBits samples,
+ VkImageType type,
+ VkFormat format);
+VkResult anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
+ struct anv_trtt_batch_bo **out_trtt_bbo);
+
+struct anv_buffer {
+ struct vk_buffer vk;
+
+ /* Set when bound */
+ struct anv_address address;
+
+ struct anv_sparse_binding_data sparse_data;
+};
+
+static inline bool
+anv_buffer_is_sparse(const struct anv_buffer *buffer)
+{
+ return buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT;
+}
+
+enum anv_cmd_dirty_bits {
+ ANV_CMD_DIRTY_PIPELINE = 1 << 0,
+ ANV_CMD_DIRTY_INDEX_BUFFER = 1 << 1,
+ ANV_CMD_DIRTY_RENDER_AREA = 1 << 2,
+ ANV_CMD_DIRTY_RENDER_TARGETS = 1 << 3,
+ ANV_CMD_DIRTY_XFB_ENABLE = 1 << 4,
+ ANV_CMD_DIRTY_RESTART_INDEX = 1 << 5,
+ ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE = 1 << 6,
+ ANV_CMD_DIRTY_FS_MSAA_FLAGS = 1 << 7,
+};
+typedef enum anv_cmd_dirty_bits anv_cmd_dirty_mask_t;
enum anv_pipe_bits {
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT = (1 << 0),
@@ -2479,6 +3213,20 @@ enum anv_pipe_bits {
* must reinterpret this flush as ANV_PIPE_DATA_CACHE_FLUSH_BIT.
*/
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT = (1 << 14),
+ ANV_PIPE_PSS_STALL_SYNC_BIT = (1 << 15),
+
+ /*
+ * This bit flushes the data-port's Untyped L1 data cache (LSC L1).
+ */
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT = (1 << 16),
+
+ /* This bit controls the flushing of the engine-specific (Render, Compute)
+ * entries from the compression cache.
+ */
+ ANV_PIPE_CCS_CACHE_FLUSH_BIT = (1 << 17),
+
+ ANV_PIPE_TLB_INVALIDATE_BIT = (1 << 18),
+
ANV_PIPE_CS_STALL_BIT = (1 << 20),
ANV_PIPE_END_OF_PIPE_SYNC_BIT = (1 << 21),
@@ -2489,225 +3237,158 @@ enum anv_pipe_bits {
*/
ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT = (1 << 22),
- /* This bit does not exist directly in PIPE_CONTROL. It means that render
- * target operations related to transfer commands with VkBuffer as
- * destination are ongoing. Some operations like copies on the command
- * streamer might need to be aware of this to trigger the appropriate stall
- * before they can proceed with the copy.
- */
- ANV_PIPE_RENDER_TARGET_BUFFER_WRITES = (1 << 23),
-
/* This bit does not exist directly in PIPE_CONTROL. It means that Gfx12
* AUX-TT data has changed and we need to invalidate AUX-TT data. This is
* done by writing the AUX-TT register.
*/
- ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 24),
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT = (1 << 23),
/* This bit does not exist directly in PIPE_CONTROL. It means that a
* PIPE_CONTROL with a post-sync operation will follow. This is used to
* implement a workaround for Gfx9.
*/
- ANV_PIPE_POST_SYNC_BIT = (1 << 25),
+ ANV_PIPE_POST_SYNC_BIT = (1 << 24),
};
+/* These bits track the state of buffer writes for queries. They get cleared
+ * based on PIPE_CONTROL emissions.
+ */
+enum anv_query_bits {
+ ANV_QUERY_WRITES_RT_FLUSH = (1 << 0),
+
+ ANV_QUERY_WRITES_TILE_FLUSH = (1 << 1),
+
+ ANV_QUERY_WRITES_CS_STALL = (1 << 2),
+
+ ANV_QUERY_WRITES_DATA_FLUSH = (1 << 3),
+};
+
+/* It's not clear why DG2 doesn't have issues with L3/CS coherency. But it's
+ * likely related to performance workaround 14015868140.
+ *
+ * For now we enable this only on DG2 and platforms prior to Gfx12 where there
+ * is no tile cache.
+ */
+#define ANV_DEVINFO_HAS_COHERENT_L3_CS(devinfo) \
+ (intel_device_info_is_dg2(devinfo))
+
+/* Things we need to flush before accessing query data using the command
+ * streamer.
+ *
+ * Prior to DG2 experiments show that the command streamer is not coherent
+ * with the tile cache so we need to flush it to make any data visible to CS.
+ *
+ * Otherwise we want to flush the RT cache which is where blorp writes, either
+ * for clearing the query buffer or for clearing the destination buffer in
+ * vkCopyQueryPoolResults().
+ */
+#define ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(devinfo) \
+ (((!ANV_DEVINFO_HAS_COHERENT_L3_CS(devinfo) && \
+ (devinfo)->ver >= 12) ? \
+ ANV_QUERY_WRITES_TILE_FLUSH : 0) | \
+ ANV_QUERY_WRITES_RT_FLUSH | \
+ ANV_QUERY_WRITES_CS_STALL)
+#define ANV_QUERY_COMPUTE_WRITES_PENDING_BITS \
+ (ANV_QUERY_WRITES_DATA_FLUSH | \
+ ANV_QUERY_WRITES_CS_STALL)
+
+#define ANV_PIPE_QUERY_BITS(pending_query_bits) ( \
+ ((pending_query_bits & ANV_QUERY_WRITES_RT_FLUSH) ? \
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0) | \
+ ((pending_query_bits & ANV_QUERY_WRITES_TILE_FLUSH) ? \
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0) | \
+ ((pending_query_bits & ANV_QUERY_WRITES_CS_STALL) ? \
+ ANV_PIPE_CS_STALL_BIT : 0) | \
+ ((pending_query_bits & ANV_QUERY_WRITES_DATA_FLUSH) ? \
+ (ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT) : 0))
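As an illustration of how these macros could fit together (the fields come from the command-buffer state struct later in this header; the surrounding code is a sketch, not the driver's actual path):

   /* Before vkCmdCopyQueryPoolResults() reads the query data with the command
    * streamer, turn any still-pending query writes into PIPE_CONTROL bits.
    */
   enum anv_pipe_bits query_flushes =
      ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.buffer_write_bits);
   cmd_buffer->state.pending_pipe_bits |= query_flushes;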
+
#define ANV_PIPE_FLUSH_BITS ( \
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \
ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT | \
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | \
ANV_PIPE_TILE_CACHE_FLUSH_BIT)
#define ANV_PIPE_STALL_BITS ( \
ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \
ANV_PIPE_DEPTH_STALL_BIT | \
- ANV_PIPE_CS_STALL_BIT)
+ ANV_PIPE_CS_STALL_BIT | \
+ ANV_PIPE_PSS_STALL_SYNC_BIT)
#define ANV_PIPE_INVALIDATE_BITS ( \
ANV_PIPE_STATE_CACHE_INVALIDATE_BIT | \
ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT | \
ANV_PIPE_VF_CACHE_INVALIDATE_BIT | \
- ANV_PIPE_HDC_PIPELINE_FLUSH_BIT | \
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT | \
ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT | \
ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)
-static inline enum anv_pipe_bits
-anv_pipe_flush_bits_for_access_flags(struct anv_device *device,
- VkAccessFlags flags)
-{
- enum anv_pipe_bits pipe_bits = 0;
-
- u_foreach_bit(b, flags) {
- switch ((VkAccessFlagBits)(1 << b)) {
- case VK_ACCESS_SHADER_WRITE_BIT:
- /* We're transitioning a buffer that was previously used as write
- * destination through the data port. To make its content available
- * to future operations, flush the hdc pipeline.
- */
- pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
- break;
- case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
- /* We're transitioning a buffer that was previously used as render
- * target. To make its content available to future operations, flush
- * the render target cache.
- */
- pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
- /* We're transitioning a buffer that was previously used as depth
- * buffer. To make its content available to future operations, flush
- * the depth cache.
- */
- pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_TRANSFER_WRITE_BIT:
- /* We're transitioning a buffer that was previously used as a
- * transfer write destination. Generic write operations include color
- * & depth operations as well as buffer operations like :
- * - vkCmdClearColorImage()
- * - vkCmdClearDepthStencilImage()
- * - vkCmdBlitImage()
- * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
- *
- * Most of these operations are implemented using Blorp which writes
- * through the render target, so flush that cache to make it visible
- * to future operations. And for depth related operations we also
- * need to flush the depth cache.
- */
- pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
- pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_MEMORY_WRITE_BIT:
- /* We're transitioning a buffer for generic write operations. Flush
- * all the caches.
- */
- pipe_bits |= ANV_PIPE_FLUSH_BITS;
- break;
- case VK_ACCESS_HOST_WRITE_BIT:
- /* We're transitioning a buffer for access by CPU. Invalidate
- * all the caches. Since data and tile caches don't have invalidate,
- * we are forced to flush those as well.
- */
- pipe_bits |= ANV_PIPE_FLUSH_BITS;
- pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
- break;
- default:
- break; /* Nothing to do */
- }
- }
-
- return pipe_bits;
-}
+/* PIPE_CONTROL bits that should be set only in 3D RCS mode.
+ * For more details see genX(emit_apply_pipe_flushes).
+ */
+#define ANV_PIPE_GFX_BITS ( \
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | \
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT | \
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT | \
+ ANV_PIPE_DEPTH_STALL_BIT | \
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT | \
+ (GFX_VERx10 >= 125 ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0) | \
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT)
-static inline enum anv_pipe_bits
-anv_pipe_invalidate_bits_for_access_flags(struct anv_device *device,
- VkAccessFlags flags)
-{
- enum anv_pipe_bits pipe_bits = 0;
-
- u_foreach_bit(b, flags) {
- switch ((VkAccessFlagBits)(1 << b)) {
- case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
- /* Indirect draw commands take a buffer as input that we're going to
- * read from the command streamer to load some of the HW registers
- * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
- * command streamer stall so that all the cache flushes have
- * completed before the command streamer loads from memory.
- */
- pipe_bits |= ANV_PIPE_CS_STALL_BIT;
- /* Indirect draw commands also set gl_BaseVertex & gl_BaseIndex
- * through a vertex buffer, so invalidate that cache.
- */
- pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
- /* For CmdDipatchIndirect, we also load gl_NumWorkGroups through a
- * UBO from the buffer, so we need to invalidate constant cache.
- */
- pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
- pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
- /* Tile cache flush needed For CmdDipatchIndirect since command
- * streamer and vertex fetch aren't L3 coherent.
- */
- pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_INDEX_READ_BIT:
- case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
- /* We transitioning a buffer to be used for as input for vkCmdDraw*
- * commands, so we invalidate the VF cache to make sure there is no
- * stale data when we start rendering.
- */
- pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
- break;
- case VK_ACCESS_UNIFORM_READ_BIT:
- /* We transitioning a buffer to be used as uniform data. Because
- * uniform is accessed through the data port & sampler, we need to
- * invalidate the texture cache (sampler) & constant cache (data
- * port) to avoid stale data.
- */
- pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
- if (device->physical->compiler->indirect_ubos_use_sampler)
- pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
- else
- pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
- break;
- case VK_ACCESS_SHADER_READ_BIT:
- case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
- case VK_ACCESS_TRANSFER_READ_BIT:
- /* Transitioning a buffer to be read through the sampler, so
- * invalidate the texture cache, we don't want any stale data.
- */
- pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
- break;
- case VK_ACCESS_MEMORY_READ_BIT:
- /* Transitioning a buffer for generic read, invalidate all the
- * caches.
- */
- pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
- break;
- case VK_ACCESS_MEMORY_WRITE_BIT:
- /* Generic write, make sure all previously written things land in
- * memory.
- */
- pipe_bits |= ANV_PIPE_FLUSH_BITS;
- break;
- case VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT:
- /* Transitioning a buffer for conditional rendering. We'll load the
- * content of this buffer into HW registers using the command
- * streamer, so we need to stall the command streamer to make sure
- * any in-flight flush operations have completed. Needs tile cache
- * and data cache flush because command stream isn't L3 coherent yet.
- */
- pipe_bits |= ANV_PIPE_CS_STALL_BIT;
- pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
- pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
- break;
- case VK_ACCESS_HOST_READ_BIT:
- /* We're transitioning a buffer that was written by CPU. Flush
- * all the caches.
- */
- pipe_bits |= ANV_PIPE_FLUSH_BITS;
- break;
- default:
- break; /* Nothing to do */
- }
- }
+/* PIPE_CONTROL bits that should be set only in Media/GPGPU RCS mode.
+ * For more details see genX(emit_apply_pipe_flushes).
+ *
+ * Documentation says that untyped L1 dataport cache flush is controlled by
+ * HDC pipeline flush in 3D mode according to HDC_CHICKEN0 register:
+ *
+ * BSpec 47112: PIPE_CONTROL::HDC Pipeline Flush:
+ *
+ * "When the "Pipeline Select" mode in PIPELINE_SELECT command is set to
+ * "3D", HDC Pipeline Flush can also flush/invalidate the LSC Untyped L1
+ * cache based on the programming of HDC_Chicken0 register bits 13:11."
+ *
+ * "When the 'Pipeline Select' mode is set to 'GPGPU', the LSC Untyped L1
+ * cache flush is controlled by 'Untyped Data-Port Cache Flush' bit in the
+ * PIPE_CONTROL command."
+ *
+ * As part of Wa_22010960976 & Wa_14013347512, i915 is programming
+ * HDC_CHICKEN0[11:13] = 0 ("Untyped L1 is flushed, for both 3D Pipecontrol
+ * Dataport flush, and UAV coherency barrier event"). So there is no need
+ * to set "Untyped Data-Port Cache" in 3D mode.
+ *
+ * On MTL the HDC_CHICKEN0 default values changed to match what was programmed
+ * by Wa_22010960976 & Wa_14013347512 on DG2, but experiments show that the
+ * change runs a bit deeper. Even manually writing to the HDC_CHICKEN0
+ * register to force L1 untyped flush with HDC pipeline flush has no effect on
+ * MTL.
+ *
+ * It seems like the HW change completely disconnected L1 untyped flush from
+ * HDC pipeline flush with no way to bring that behavior back. So leave the L1
+ * untyped flush active in 3D mode on all platforms since it doesn't seems to
+ * cause issues there too.
+ *
+ * Maybe we'll have some GPGPU only bits here at some point.
+ */
+#define ANV_PIPE_GPGPU_BITS (0)
- return pipe_bits;
-}
+enum intel_ds_stall_flag
+anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits);
-#define VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV ( \
- VK_IMAGE_ASPECT_COLOR_BIT | \
- VK_IMAGE_ASPECT_PLANE_0_BIT | \
- VK_IMAGE_ASPECT_PLANE_1_BIT | \
- VK_IMAGE_ASPECT_PLANE_2_BIT)
#define VK_IMAGE_ASPECT_PLANES_BITS_ANV ( \
VK_IMAGE_ASPECT_PLANE_0_BIT | \
VK_IMAGE_ASPECT_PLANE_1_BIT | \
VK_IMAGE_ASPECT_PLANE_2_BIT)
+#define VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV ( \
+ VK_IMAGE_ASPECT_COLOR_BIT | \
+ VK_IMAGE_ASPECT_PLANES_BITS_ANV)
+
struct anv_vertex_binding {
struct anv_buffer * buffer;
VkDeviceSize offset;
- VkDeviceSize stride;
VkDeviceSize size;
};
@@ -2721,129 +3402,88 @@ struct anv_push_constants {
/** Push constant data provided by the client through vkPushConstants */
uint8_t client_data[MAX_PUSH_CONSTANTS_SIZE];
- /** Dynamic offsets for dynamic UBOs and SSBOs */
- uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
-
- /* Robust access pushed registers. */
- uint64_t push_reg_mask[MESA_SHADER_STAGES];
-
- /** Pad out to a multiple of 32 bytes */
- uint32_t pad[2];
-
- /* Base addresses for descriptor sets */
- uint64_t desc_sets[MAX_SETS];
-
- struct {
- /** Base workgroup ID
- *
- * Used for vkCmdDispatchBase.
- */
- uint32_t base_work_group_id[3];
-
- /** Subgroup ID
- *
- * This is never set by software but is implicitly filled out when
- * uploading the push constants for compute shaders.
- */
- uint32_t subgroup_id;
- } cs;
-};
+#define ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK ((uint32_t)ANV_UBO_ALIGNMENT - 1)
+#define ANV_DESCRIPTOR_SET_OFFSET_MASK (~(uint32_t)(ANV_UBO_ALIGNMENT - 1))
-struct anv_dynamic_state {
- struct {
- uint32_t count;
- VkViewport viewports[MAX_VIEWPORTS];
- } viewport;
-
- struct {
- uint32_t count;
- VkRect2D scissors[MAX_SCISSORS];
- } scissor;
-
- float line_width;
-
- struct {
- float bias;
- float clamp;
- float slope;
- } depth_bias;
-
- float blend_constants[4];
-
- struct {
- float min;
- float max;
- } depth_bounds;
+ /**
+ * Base surface offsets for descriptor sets.
+ *
+ * The offset has a different meaning depending on a number of factors:
+ *
+ * - with descriptor sets (direct or indirect), this is relative to
+ * pdevice->va.descriptor_pool
+ *
+ * - with descriptor buffers on DG2+, relative to
+ * device->va.descriptor_buffer_pool
+ *
+ * - with descriptor buffers prior to DG2, relative to the programmed value
+ * in STATE_BASE_ADDRESS::BindlessSurfaceStateBaseAddress
+ */
+ uint32_t desc_surface_offsets[MAX_SETS];
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_compare_mask;
+ /**
+ * Base sampler offsets for descriptor sets.
+ */
+ uint32_t desc_sampler_offsets[MAX_SETS];
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_write_mask;
+ /** Dynamic offsets for dynamic UBOs and SSBOs */
+ uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS];
- struct {
- uint32_t front;
- uint32_t back;
- } stencil_reference;
+ /** Surface buffer base offset
+ *
+ * Only used prior to DG2 with descriptor buffers.
+ *
+ * (surfaces_base_offset + desc_surface_offsets[set_index]) is relative to
+ * device->va.descriptor_buffer_pool and can be used to compute a 64bit
+ * address to the descriptor buffer (using load_desc_set_address_intel).
+ */
+ uint32_t surfaces_base_offset;
- struct {
- struct {
- VkStencilOp fail_op;
- VkStencilOp pass_op;
- VkStencilOp depth_fail_op;
- VkCompareOp compare_op;
- } front;
+ union {
struct {
- VkStencilOp fail_op;
- VkStencilOp pass_op;
- VkStencilOp depth_fail_op;
- VkCompareOp compare_op;
- } back;
- } stencil_op;
+ /** Dynamic MSAA value */
+ uint32_t fs_msaa_flags;
- struct {
- uint32_t factor;
- uint16_t pattern;
- } line_stipple;
+ /** Dynamic TCS input vertices */
+ uint32_t tcs_input_vertices;
+ } gfx;
- struct {
- uint32_t samples;
- VkSampleLocationEXT locations[MAX_SAMPLE_LOCATIONS];
- } sample_locations;
+ struct {
+ /** Base workgroup ID
+ *
+ * Used for vkCmdDispatchBase.
+ */
+ uint32_t base_work_group_id[3];
- VkExtent2D fragment_shading_rate;
+ /** Subgroup ID
+ *
+ * This is never set by software but is implicitly filled out when
+ * uploading the push constants for compute shaders.
+ */
+ uint32_t subgroup_id;
+ } cs;
+ };
- VkCullModeFlags cull_mode;
- VkFrontFace front_face;
- VkPrimitiveTopology primitive_topology;
- bool depth_test_enable;
- bool depth_write_enable;
- VkCompareOp depth_compare_op;
- bool depth_bounds_test_enable;
- bool stencil_test_enable;
- bool raster_discard;
- bool depth_bias_enable;
- bool primitive_restart_enable;
- VkLogicOp logic_op;
- bool dyn_vbo_stride;
- bool dyn_vbo_size;
+ /* Robust access pushed registers. */
+ uint64_t push_reg_mask[MESA_SHADER_STAGES];
- /* Bitfield, one bit per render target */
- uint8_t color_writes;
+ /** Ray query globals (RT_DISPATCH_GLOBALS) */
+ uint64_t ray_query_globals;
};
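A hedged sketch of how the ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK / ANV_DESCRIPTOR_SET_OFFSET_MASK macros above could split a packed desc_surface_offsets[] entry; the packing (dynamic-offset index in the low bits, ANV_UBO_ALIGNMENT-aligned offset in the upper bits) is inferred from the mask values, not stated by this patch:

   /* push is an assumed pointer to struct anv_push_constants. */
   uint32_t packed  = push->desc_surface_offsets[set_idx];
   uint32_t dyn_idx = packed & ANV_DESCRIPTOR_SET_DYNAMIC_INDEX_MASK;
   uint32_t offset  = packed & ANV_DESCRIPTOR_SET_OFFSET_MASK;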
-extern const struct anv_dynamic_state default_dynamic_state;
-
-uint32_t anv_dynamic_state_copy(struct anv_dynamic_state *dest,
- const struct anv_dynamic_state *src,
- uint32_t copy_mask);
-
struct anv_surface_state {
+ /** Surface state allocated from the bindless heap
+ *
+ * Can be NULL if unused.
+ */
struct anv_state state;
+
+ /** Surface state after genxml packing
+ *
+ * Same data as in state.
+ */
+ struct anv_surface_state_data state_data;
+
/** Address of the surface referred to by this state
*
* This address is relative to the start of the BO.
@@ -2864,31 +3504,16 @@ struct anv_surface_state {
struct anv_address clear_address;
};
-/**
- * Attachment state when recording a renderpass instance.
- *
- * The clear value is valid only if there exists a pending clear.
- */
-struct anv_attachment_state {
- enum isl_aux_usage aux_usage;
- struct anv_surface_state color;
- struct anv_surface_state input;
-
- VkImageLayout current_layout;
- VkImageLayout current_stencil_layout;
- VkImageAspectFlags pending_clear_aspects;
- VkImageAspectFlags pending_load_aspects;
- bool fast_clear;
- VkClearValue clear_value;
+struct anv_attachment {
+ VkFormat vk_format;
+ const struct anv_image_view *iview;
+ VkImageLayout layout;
+ enum isl_aux_usage aux_usage;
+ struct anv_surface_state surface_state;
- /* When multiview is active, attachments with a renderpass clear
- * operation have their respective layers cleared on the first
- * subpass that uses them, and only in that subpass. We keep track
- * of this using a bitfield to indicate which layers of an attachment
- * have not been cleared yet when multiview is active.
- */
- uint32_t pending_clear_views;
- struct anv_image_view * image_view;
+ VkResolveModeFlagBits resolve_mode;
+ const struct anv_image_view *resolve_iview;
+ VkImageLayout resolve_layout;
};
/** State tracking for vertex buffer flushes
@@ -2912,6 +3537,70 @@ struct anv_vb_cache_range {
uint64_t end;
};
+static inline void
+anv_merge_vb_cache_range(struct anv_vb_cache_range *dirty,
+ const struct anv_vb_cache_range *bound)
+{
+ if (dirty->start == dirty->end) {
+ *dirty = *bound;
+ } else if (bound->start != bound->end) {
+ dirty->start = MIN2(dirty->start, bound->start);
+ dirty->end = MAX2(dirty->end, bound->end);
+ }
+}
+
+/* Check whether we need to apply the Gfx8-9 vertex buffer workaround */
+static inline bool
+anv_gfx8_9_vb_cache_range_needs_workaround(struct anv_vb_cache_range *bound,
+ struct anv_vb_cache_range *dirty,
+ struct anv_address vb_address,
+ uint32_t vb_size)
+{
+ if (vb_size == 0) {
+ bound->start = 0;
+ bound->end = 0;
+ return false;
+ }
+
+ bound->start = intel_48b_address(anv_address_physical(vb_address));
+ bound->end = bound->start + vb_size;
+ assert(bound->end > bound->start); /* No overflow */
+
+ /* Align everything to a cache line */
+ bound->start &= ~(64ull - 1ull);
+ bound->end = align64(bound->end, 64);
+
+ anv_merge_vb_cache_range(dirty, bound);
+
+ /* If our range is larger than 32 bits, we have to flush */
+ assert(bound->end - bound->start <= (1ull << 32));
+ return (dirty->end - dirty->start) > (1ull << 32);
+}
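A usage sketch for the helper above (gfx and idx are assumed locals pointing at the graphics state declared later in this header):

   if (anv_gfx8_9_vb_cache_range_needs_workaround(&gfx->vb_bound_ranges[idx],
                                                  &gfx->vb_dirty_ranges[idx],
                                                  vb_address, vb_size)) {
      /* The combined dirty range exceeds 32 bits of address space, so the VF
       * cache must be invalidated before the next draw.
       */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
   }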
+
+/**
+ * State tracking for simple internal shaders
+ */
+struct anv_simple_shader {
+ /* The device associated with this emission */
+ struct anv_device *device;
+ /* The command buffer associated with this emission (can be NULL) */
+ struct anv_cmd_buffer *cmd_buffer;
+ /* State stream used for various internal allocations */
+ struct anv_state_stream *dynamic_state_stream;
+ struct anv_state_stream *general_state_stream;
+ /* Where to emit the commands (can be different from cmd_buffer->batch) */
+ struct anv_batch *batch;
+ /* Shader to use */
+ struct anv_shader_bin *kernel;
+ /* L3 config used by the shader */
+ const struct intel_l3_config *l3_config;
+ /* Current URB config */
+ const struct intel_urb_config *urb_cfg;
+
+ /* Managed by the simple shader helper */
+ struct anv_state bt_state;
+};
+
/** State tracking for particular pipeline bind point
*
* This struct is the base struct for anv_cmd_graphics_state and
@@ -2922,12 +3611,55 @@ struct anv_vb_cache_range {
*/
struct anv_cmd_pipeline_state {
struct anv_descriptor_set *descriptors[MAX_SETS];
- struct anv_push_descriptor_set *push_descriptors[MAX_SETS];
+ struct {
+ bool bound;
+ /**
+ * Buffer index used by this descriptor set.
+ */
+ int32_t buffer_index; /* -1 means push descriptor */
+ /**
+ * Offset of the descriptor set in the descriptor buffer.
+ */
+ uint32_t buffer_offset;
+ /**
+ * Final computed address to be emitted in the descriptor set surface
+ * state.
+ */
+ uint64_t address;
+ /**
+ * The descriptor set surface state.
+ */
+ struct anv_state state;
+ } descriptor_buffers[MAX_SETS];
+ struct anv_push_descriptor_set push_descriptor;
struct anv_push_constants push_constants;
+ /** Tracks whether the push constant data has changed and needs to be reemitted */
+ bool push_constants_data_dirty;
+
/* Push constant state allocated when flushing push constants. */
struct anv_state push_constants_state;
+
+ /**
+ * Dynamic buffer offsets.
+ *
+ * We have a maximum of MAX_DYNAMIC_BUFFERS per pipeline, but with
+ * independent sets we cannot know which how much in total is going to be
+ * used. As a result we need to store the maximum possible number per set.
+ *
+ * Those values are written into anv_push_constants::dynamic_offsets at
+ * flush time when have the pipeline with the final
+ * anv_pipeline_sets_layout.
+ */
+ struct {
+ uint32_t offsets[MAX_DYNAMIC_BUFFERS];
+ } dynamic_offsets[MAX_SETS];
+
+ /**
+ * The current bound pipeline.
+ */
+ struct anv_pipeline *pipeline;
};
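A flush-time sketch of how the per-set offsets above could be compacted into anv_push_constants::dynamic_offsets once the final sets layout is known (dynamic_offset_count on the set layout is an assumption, it does not appear in the hunks shown):

   uint32_t *out = pipe_state->push_constants.dynamic_offsets;
   for (uint32_t s = 0; s < layout->num_sets; s++) {
      if (layout->set[s].layout == NULL)
         continue;
      for (uint32_t i = 0; i < layout->set[s].layout->dynamic_offset_count; i++) {
         out[layout->set[s].dynamic_offset_start + i] =
            pipe_state->dynamic_offsets[s].offsets[i];
      }
   }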
/** State tracking for graphics pipeline
@@ -2940,7 +3672,17 @@ struct anv_cmd_pipeline_state {
struct anv_cmd_graphics_state {
struct anv_cmd_pipeline_state base;
- struct anv_graphics_pipeline *pipeline;
+ VkRenderingFlags rendering_flags;
+ VkRect2D render_area;
+ uint32_t layer_count;
+ uint32_t samples;
+ uint32_t view_mask;
+ uint32_t color_att_count;
+ struct anv_state att_states;
+ struct anv_attachment color_att[MAX_RTS];
+ struct anv_attachment depth_att;
+ struct anv_attachment stencil_att;
+ struct anv_state null_surface_state;
anv_cmd_dirty_mask_t dirty;
uint32_t vb_dirty;
@@ -2950,23 +3692,55 @@ struct anv_cmd_graphics_state {
struct anv_vb_cache_range vb_bound_ranges[33];
struct anv_vb_cache_range vb_dirty_ranges[33];
- VkShaderStageFlags push_constant_stages;
+ uint32_t restart_index;
- struct anv_dynamic_state dynamic;
+ VkShaderStageFlags push_constant_stages;
uint32_t primitive_topology;
+ bool used_task_shader;
- struct {
- struct anv_buffer *index_buffer;
- uint32_t index_type; /**< 3DSTATE_INDEX_BUFFER.IndexFormat */
- uint32_t index_offset;
- } gfx7;
+ struct anv_buffer *index_buffer;
+ uint32_t index_type; /**< 3DSTATE_INDEX_BUFFER.IndexFormat */
+ uint32_t index_offset;
+ uint32_t index_size;
+
+ struct vk_vertex_input_state vertex_input;
+ struct vk_sample_locations_state sample_locations;
+
+ /* Dynamic msaa flags; this value can be different from
+ * anv_push_constants::gfx::fs_msaa_flags, as the push constant value only
+ * needs to be updated for fragment shaders dynamically checking the value.
+ */
+ enum intel_msaa_flags fs_msaa_flags;
+
+ bool object_preemption;
+ bool has_uint_rt;
+
+ /* State tracking for Wa_14018912822. */
+ bool color_blend_zero;
+ bool alpha_blend_zero;
+
+ /**
+ * DEPTH and STENCIL attachment write state for Wa_18019816803.
+ */
+ bool ds_write_state;
+
+ /**
+ * State tracking for Wa_18020335297.
+ */
+ bool viewport_set;
+
+ struct intel_urb_config urb_cfg;
+
+ uint32_t n_occlusion_queries;
+
+ struct anv_gfx_dynamic_state dyn_state;
};
enum anv_depth_reg_mode {
ANV_DEPTH_REG_MODE_UNKNOWN = 0,
ANV_DEPTH_REG_MODE_HW_DEFAULT,
- ANV_DEPTH_REG_MODE_D16,
+ ANV_DEPTH_REG_MODE_D16_1X_MSAA,
};
/** State tracking for compute pipeline
@@ -2979,26 +3753,33 @@ enum anv_depth_reg_mode {
struct anv_cmd_compute_state {
struct anv_cmd_pipeline_state base;
- struct anv_compute_pipeline *pipeline;
-
bool pipeline_dirty;
struct anv_state push_data;
struct anv_address num_workgroups;
+
+ uint32_t scratch_size;
};
struct anv_cmd_ray_tracing_state {
struct anv_cmd_pipeline_state base;
- struct anv_ray_tracing_pipeline *pipeline;
-
bool pipeline_dirty;
struct {
struct anv_bo *bo;
struct brw_rt_scratch_layout layout;
} scratch;
+
+ struct anv_address build_priv_mem_addr;
+ size_t build_priv_mem_size;
+};
+
+enum anv_cmd_descriptor_buffer_mode {
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN,
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY,
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER,
};
/** State required while building cmd buffer */
@@ -3013,23 +3794,63 @@ struct anv_cmd_state {
struct anv_cmd_ray_tracing_state rt;
enum anv_pipe_bits pending_pipe_bits;
+ const char * pc_reasons[4];
+ uint32_t pc_reasons_count;
+
+ /**
+ * Whether the last programmed STATE_BASE_ADDRESS references
+ * anv_device::dynamic_state_pool or anv_device::dynamic_state_pool_db for
+ * the dynamic state heap.
+ */
+ enum anv_cmd_descriptor_buffer_mode current_db_mode;
+
+ /**
+ * Whether the command buffer has pending descriptor buffers bound to it. This
+ * variable changes before anv_device::current_db_mode.
+ */
+ enum anv_cmd_descriptor_buffer_mode pending_db_mode;
+
+ struct {
+ /**
+ * Tracks operations that may interfere with queries in the destination
+ * buffer of vkCmdCopyQueryPoolResults; those operations need to have
+ * completed before we do the work of vkCmdCopyQueryPoolResults.
+ */
+ enum anv_query_bits buffer_write_bits;
+
+ /**
+ * Tracks clear operations of query buffers that can interact with
+ * vkCmdBeginQuery*, vkCmdWriteTimestamp*,
+ * vkCmdWriteAccelerationStructuresPropertiesKHR, etc...
+ *
+ * We need the clearing of the buffer to have completed before we write
+ * data with the command streamer or a shader.
+ */
+ enum anv_query_bits clear_bits;
+ } queries;
+
VkShaderStageFlags descriptors_dirty;
+ VkShaderStageFlags push_descriptors_dirty;
+ /** Tracks the 3DSTATE_CONSTANT_* instructions that need to be reemitted */
VkShaderStageFlags push_constants_dirty;
- struct anv_framebuffer * framebuffer;
- struct anv_render_pass * pass;
- struct anv_subpass * subpass;
- VkRect2D render_area;
- uint32_t restart_index;
+ struct {
+ uint64_t surfaces_address;
+ uint64_t samplers_address;
+ bool dirty;
+ VkShaderStageFlags offsets_dirty;
+ uint64_t address[MAX_SETS];
+ } descriptor_buffers;
+
struct anv_vertex_binding vertex_bindings[MAX_VBS];
bool xfb_enabled;
struct anv_xfb_binding xfb_bindings[MAX_XFB_BUFFERS];
struct anv_state binding_tables[MESA_VULKAN_SHADER_STAGES];
struct anv_state samplers[MESA_VULKAN_SHADER_STAGES];
- unsigned char sampler_sha1s[MESA_SHADER_STAGES][20];
- unsigned char surface_sha1s[MESA_SHADER_STAGES][20];
- unsigned char push_sha1s[MESA_SHADER_STAGES][20];
+ unsigned char sampler_sha1s[MESA_VULKAN_SHADER_STAGES][20];
+ unsigned char surface_sha1s[MESA_VULKAN_SHADER_STAGES][20];
+ unsigned char push_sha1s[MESA_VULKAN_SHADER_STAGES][20];
/**
* Whether or not the gfx8 PMA fix is enabled. We ensure that, at the top
@@ -3045,47 +3866,36 @@ struct anv_cmd_state {
*/
bool hiz_enabled;
- /* We ensure the registers for the gfx12 D16 fix are initalized at the
+ /* We ensure the registers for the gfx12 D16 fix are initialized at the
* first non-NULL depth stencil packet emission of every command buffer.
* For secondary command buffer execution, we transfer the state from the
* last command buffer to the primary (if known).
*/
enum anv_depth_reg_mode depth_reg_mode;
- bool conditional_render_enabled;
-
/**
- * Last rendering scale argument provided to
- * genX(cmd_buffer_emit_hashing_mode)().
+ * Whether RHWO optimization is enabled (Wa_1508744258).
*/
- unsigned current_hash_scale;
+ bool rhwo_optimization_enabled;
/**
- * Array length is anv_cmd_state::pass::attachment_count. Array content is
- * valid only when recording a render pass instance.
+ * Pending state of the RHWO optimization, to be applied at the next
+ * genX(cmd_buffer_apply_pipe_flushes).
*/
- struct anv_attachment_state * attachments;
+ bool pending_rhwo_optimization_enabled;
+
+ bool conditional_render_enabled;
/**
- * Surface states for color render targets. These are stored in a single
- * flat array. For depth-stencil attachments, the surface state is simply
- * left blank.
+ * Last rendering scale argument provided to
+ * genX(cmd_buffer_emit_hashing_mode)().
*/
- struct anv_state attachment_states;
+ unsigned current_hash_scale;
/**
- * A null surface state of the right size to match the framebuffer. This
- * is one of the states in attachment_states.
+ * A buffer used for spill/fill of ray queries.
*/
- struct anv_state null_surface_state;
-};
-
-struct anv_cmd_pool {
- struct vk_object_base base;
- VkAllocationCallbacks alloc;
- struct list_head cmd_buffers;
-
- VkCommandPoolCreateFlags flags;
+ struct anv_bo * ray_query_shadow_bo;
};
#define ANV_MIN_CMD_BUFFER_BATCH_SIZE 8192
@@ -3103,13 +3913,12 @@ enum anv_cmd_buffer_exec_mode {
struct anv_measure_batch;
struct anv_cmd_buffer {
- struct vk_object_base base;
+ struct vk_command_buffer vk;
struct anv_device * device;
+ struct anv_queue_family * queue_family;
- struct anv_cmd_pool * pool;
- struct list_head pool_link;
-
+ /** Batch where the main commands live */
struct anv_batch batch;
/* Pointer to the location in the batch where MI_BATCH_BUFFER_END was
@@ -3140,8 +3949,6 @@ struct anv_cmd_buffer {
struct anv_state bt_next;
struct anv_reloc_list surface_relocs;
- /** Last seen surface state block pool center bo offset */
- uint32_t last_ss_pool_center;
/* Serial for tracking buffer completion */
uint32_t serial;
@@ -3149,10 +3956,12 @@ struct anv_cmd_buffer {
/* Stream objects for storing temporary data */
struct anv_state_stream surface_state_stream;
struct anv_state_stream dynamic_state_stream;
+ struct anv_state_stream dynamic_state_db_stream;
struct anv_state_stream general_state_stream;
+ struct anv_state_stream indirect_push_descriptor_stream;
+ struct anv_state_stream push_descriptor_buffer_stream;
VkCommandBufferUsageFlags usage_flags;
- VkCommandBufferLevel level;
struct anv_query_pool *perf_query_pool;
@@ -3180,22 +3989,169 @@ struct anv_cmd_buffer {
uint32_t perf_reloc_idx;
/**
- * Sum of all the anv_batch_bo sizes allocated for this command buffer.
- * Used to increase allocation size for long command buffers.
+ * Sum of all the anv_batch_bo written sizes for this command buffer
+ * including any executed secondary command buffer.
*/
uint32_t total_batch_size;
+
+ struct {
+ /** Batch generating part of the anv_cmd_buffer::batch */
+ struct anv_batch batch;
+
+ /**
+ * Location in anv_cmd_buffer::batch at which we left some space to
+ * insert a MI_BATCH_BUFFER_START into the
+ * anv_cmd_buffer::generation::batch if needed.
+ */
+ struct anv_address jump_addr;
+
+ /**
+ * Location in anv_cmd_buffer::batch to which the generation batch
+ * should jump back.
+ */
+ struct anv_address return_addr;
+
+ /** List of anv_batch_bo used for generation
+ *
+ * We have to keep this separate from the anv_cmd_buffer::batch_bos that
+ * is used for a chaining optimization.
+ */
+ struct list_head batch_bos;
+
+ /** Ring buffer of generated commands
+ *
+ * When generating draws in ring mode, this buffer will hold generated
+ * 3DPRIMITIVE commands.
+ */
+ struct anv_bo *ring_bo;
+
+ /**
+ * State tracking of the generation shader (only used for the non-ring
+ * mode).
+ */
+ struct anv_simple_shader shader_state;
+ } generation;
+
+ /**
+ * A vector of anv_bo pointers for chunks of memory used by the command
+ * buffer that are too large to be allocated through dynamic_state_stream.
+ * This is the case, for example, for sufficiently large acceleration
+ * structures.
+ *
+ * Initialized by anv_cmd_buffer_init_batch_bo_chain().
+ */
+ struct u_vector dynamic_bos;
+
+ /**
+ * Structure holding tracepoints recorded in the command buffer.
+ */
+ struct u_trace trace;
+
+ /** Pointer to the last emitted COMPUTE_WALKER.
+ *
+ * This is used to edit the instruction post emission to replace the "Post
+ * Sync" field for utrace timestamp emission.
+ */
+ void *last_compute_walker;
+
+ /** Pointer to the last emitted EXECUTE_INDIRECT_DISPATCH.
+ *
+ * This is used to edit the instruction post emission to replace the "Post
+ * Sync" field for utrace timestamp emission.
+ */
+ void *last_indirect_dispatch;
+
+ struct {
+ struct anv_video_session *vid;
+ struct anv_video_session_params *params;
+ } video;
+
+ /**
+ * Companion RCS command buffer to support MSAA operations on the compute
+ * queue.
+ */
+ struct anv_cmd_buffer *companion_rcs_cmd_buffer;
+
+ /**
+ * Whether this command buffer is a companion command buffer of a compute one.
+ */
+ bool is_companion_rcs_cmd_buffer;
+
};
+extern const struct vk_command_buffer_ops anv_cmd_buffer_ops;
+
/* Determine whether we can chain a given cmd_buffer to another one. We need
- * softpin and we also need to make sure that we can edit the end of the batch
- * to point to next one, which requires the command buffer to not be used
- * simultaneously.
+ * to make sure that we can edit the end of the batch to point to next one,
+ * which requires the command buffer to not be used simultaneously.
+ *
+ * We could in theory also implement chaining with companion command buffers,
+ * but let's spare ourselves some pain and misery. This optimization has no
+ * benefit on the brand new Xe kernel driver.
*/
static inline bool
anv_cmd_buffer_is_chainable(struct anv_cmd_buffer *cmd_buffer)
{
- return anv_use_softpin(cmd_buffer->device->physical) &&
- !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT);
+ return !(cmd_buffer->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT) &&
+ !(cmd_buffer->is_companion_rcs_cmd_buffer);
+}
+
+static inline bool
+anv_cmd_buffer_is_render_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_queue_family *queue_family = cmd_buffer->queue_family;
+ return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
+}
+
+static inline bool
+anv_cmd_buffer_is_video_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_queue_family *queue_family = cmd_buffer->queue_family;
+ return (queue_family->queueFlags & VK_QUEUE_VIDEO_DECODE_BIT_KHR) != 0;
+}
+
+static inline bool
+anv_cmd_buffer_is_compute_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_queue_family *queue_family = cmd_buffer->queue_family;
+ return queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE;
+}
+
+static inline bool
+anv_cmd_buffer_is_blitter_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_queue_family *queue_family = cmd_buffer->queue_family;
+ return queue_family->engine_class == INTEL_ENGINE_CLASS_COPY;
+}
+
+static inline bool
+anv_cmd_buffer_is_render_or_compute_queue(const struct anv_cmd_buffer *cmd_buffer)
+{
+ return anv_cmd_buffer_is_render_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_compute_queue(cmd_buffer);
+}
+
+static inline struct anv_address
+anv_cmd_buffer_dynamic_state_address(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state state)
+{
+ if (cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) {
+ return anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_db_pool, state);
+ }
+ return anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool, state);
+}
+
+static inline uint64_t
+anv_cmd_buffer_descriptor_buffer_address(struct anv_cmd_buffer *cmd_buffer,
+ int32_t buffer_index)
+{
+ if (buffer_index == -1)
+ return cmd_buffer->device->physical->va.push_descriptor_buffer_pool.addr;
+
+ return cmd_buffer->state.descriptor_buffers.address[buffer_index];
}
VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer);
@@ -3216,7 +4172,8 @@ VkResult anv_cmd_buffer_execbuf(struct anv_queue *queue,
VkFence fence,
int perf_query_pass);
-VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer);
+void anv_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
+ UNUSED VkCommandBufferResetFlags flags);
struct anv_state anv_cmd_buffer_emit_dynamic(struct anv_cmd_buffer *cmd_buffer,
const void *data, uint32_t size, uint32_t alignment);
@@ -3230,111 +4187,153 @@ struct anv_state
anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
uint32_t entries, uint32_t *state_offset);
struct anv_state
-anv_cmd_buffer_alloc_surface_state(struct anv_cmd_buffer *cmd_buffer);
+anv_cmd_buffer_alloc_surface_states(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t count);
struct anv_state
anv_cmd_buffer_alloc_dynamic_state(struct anv_cmd_buffer *cmd_buffer,
uint32_t size, uint32_t alignment);
+struct anv_state
+anv_cmd_buffer_alloc_general_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t size, uint32_t alignment);
+static inline struct anv_state
+anv_cmd_buffer_alloc_temporary_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t size, uint32_t alignment)
+{
+ struct anv_state state =
+ anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
+ size, alignment);
+ if (state.map == NULL)
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ return state;
+}
+static inline struct anv_address
+anv_cmd_buffer_temporary_state_address(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state state)
+{
+ return anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool, state);
+}
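+
+/* A possible usage pattern for the two helpers above (size and data below
+ * are placeholders):
+ *
+ *    struct anv_state tmp =
+ *       anv_cmd_buffer_alloc_temporary_state(cmd_buffer, size, 64);
+ *    if (tmp.map == NULL)
+ *       return;
+ *    memcpy(tmp.map, data, size);
+ *    struct anv_address addr =
+ *       anv_cmd_buffer_temporary_state_address(cmd_buffer, tmp);
+ *
+ * The resulting address can then be referenced by GPU commands emitted into
+ * the batch.
+ */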
-VkResult
-anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer);
+void
+anv_cmd_buffer_chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers);
+void
+anv_cmd_buffer_exec_batch_debug(struct anv_queue *queue,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass);
+void
+anv_cmd_buffer_clflush(struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers);
+
+void
+anv_cmd_buffer_update_pending_query_bits(struct anv_cmd_buffer *cmd_buffer,
+ enum anv_pipe_bits flushed_bits);
+
+/**
+ * An allocation tied to a command buffer.
+ *
+ * Don't use anv_cmd_alloc::address::map to write memory from userspace; use
+ * anv_cmd_alloc::map instead.
+ */
+struct anv_cmd_alloc {
+ struct anv_address address;
+ void *map;
+ size_t size;
+};
+
+#define ANV_EMPTY_ALLOC ((struct anv_cmd_alloc) { .map = NULL, .size = 0 })
+
+static inline bool
+anv_cmd_alloc_is_empty(struct anv_cmd_alloc alloc)
+{
+ return alloc.size == 0;
+}
-void gfx8_cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer);
-void gfx8_cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
- bool depth_clamp_enable);
-void gfx7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer);
+struct anv_cmd_alloc
+anv_cmd_buffer_alloc_space(struct anv_cmd_buffer *cmd_buffer,
+ size_t size, uint32_t alignment,
+ bool private);
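+
+/* A rough usage sketch for anv_cmd_buffer_alloc_space() based on the
+ * declarations above (size and data are placeholders; the last argument is
+ * the private flag):
+ *
+ *    struct anv_cmd_alloc alloc =
+ *       anv_cmd_buffer_alloc_space(cmd_buffer, size, 64, false);
+ *    if (anv_cmd_alloc_is_empty(alloc))
+ *       return;
+ *    memcpy(alloc.map, data, size);
+ *
+ * Per the anv_cmd_alloc comment above, CPU writes go through alloc.map while
+ * alloc.address is what gets referenced from GPU commands.
+ */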
-void anv_cmd_buffer_setup_attachments(struct anv_cmd_buffer *cmd_buffer,
- struct anv_render_pass *pass,
- struct anv_framebuffer *framebuffer,
- const VkClearValue *clear_values);
+VkResult
+anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer);
-void anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer);
+void anv_cmd_buffer_emit_bt_pool_base_address(struct anv_cmd_buffer *cmd_buffer);
struct anv_state
anv_cmd_buffer_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer);
struct anv_state
anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer);
-const struct anv_image_view *
-anv_cmd_buffer_get_depth_stencil_view(const struct anv_cmd_buffer *cmd_buffer);
-
VkResult
anv_cmd_buffer_alloc_blorp_binding_table(struct anv_cmd_buffer *cmd_buffer,
uint32_t num_entries,
uint32_t *state_offset,
struct anv_state *bt_state);
-void anv_cmd_buffer_dump(struct anv_cmd_buffer *cmd_buffer);
-
void anv_cmd_emit_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer);
-enum anv_fence_type {
- ANV_FENCE_TYPE_NONE = 0,
- ANV_FENCE_TYPE_BO,
- ANV_FENCE_TYPE_WSI_BO,
- ANV_FENCE_TYPE_SYNCOBJ,
- ANV_FENCE_TYPE_WSI,
+static inline unsigned
+anv_cmd_buffer_get_view_count(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ return MAX2(1, util_bitcount(gfx->view_mask));
+}
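+
+/* For example, with multiview enabled and gfx->view_mask == 0x5 the helper
+ * above returns 2 (two active views); with multiview disabled
+ * (gfx->view_mask == 0) it returns 1 so that a regular draw still counts as
+ * a single view.
+ */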
+
+/* Save/restore cmd buffer states for meta operations */
+enum anv_cmd_saved_state_flags {
+ ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE = BITFIELD_BIT(0),
+ ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0 = BITFIELD_BIT(1),
+ ANV_CMD_SAVED_STATE_PUSH_CONSTANTS = BITFIELD_BIT(2),
+};
+
+struct anv_cmd_saved_state {
+ uint32_t flags;
+
+ struct anv_pipeline *pipeline;
+ struct anv_descriptor_set *descriptor_set;
+ uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
};
-enum anv_bo_fence_state {
+void anv_cmd_buffer_save_state(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t flags,
+ struct anv_cmd_saved_state *state);
+
+void anv_cmd_buffer_restore_state(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_saved_state *state);
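+
+/* A typical meta-operation pattern with the helpers above (illustrative
+ * only; the dispatch in the middle stands in for whatever internal work is
+ * performed):
+ *
+ *    struct anv_cmd_saved_state saved;
+ *    anv_cmd_buffer_save_state(cmd_buffer,
+ *                              ANV_CMD_SAVED_STATE_COMPUTE_PIPELINE |
+ *                              ANV_CMD_SAVED_STATE_DESCRIPTOR_SET_0 |
+ *                              ANV_CMD_SAVED_STATE_PUSH_CONSTANTS,
+ *                              &saved);
+ *    ... bind internal pipeline/descriptors and dispatch ...
+ *    anv_cmd_buffer_restore_state(cmd_buffer, &saved);
+ */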
+
+enum anv_bo_sync_state {
/** Indicates that this is a new (or newly reset fence) */
- ANV_BO_FENCE_STATE_RESET,
+ ANV_BO_SYNC_STATE_RESET,
/** Indicates that this fence has been submitted to the GPU but is still
* (as far as we know) in use by the GPU.
*/
- ANV_BO_FENCE_STATE_SUBMITTED,
+ ANV_BO_SYNC_STATE_SUBMITTED,
- ANV_BO_FENCE_STATE_SIGNALED,
+ ANV_BO_SYNC_STATE_SIGNALED,
};
-struct anv_fence_impl {
- enum anv_fence_type type;
-
- union {
- /** Fence implementation for BO fences
- *
- * These fences use a BO and a set of CPU-tracked state flags. The BO
- * is added to the object list of the last execbuf call in a QueueSubmit
- * and is marked EXEC_WRITE. The state flags track when the BO has been
- * submitted to the kernel. We need to do this because Vulkan lets you
- * wait on a fence that has not yet been submitted and I915_GEM_BUSY
- * will say it's idle in this case.
- */
- struct {
- struct anv_bo *bo;
- enum anv_bo_fence_state state;
- } bo;
-
- /** DRM syncobj handle for syncobj-based fences */
- uint32_t syncobj;
+struct anv_bo_sync {
+ struct vk_sync sync;
- /** WSI fence */
- struct wsi_fence *fence_wsi;
- };
+ enum anv_bo_sync_state state;
+ struct anv_bo *bo;
};
-struct anv_fence {
- struct vk_object_base base;
-
- /* Permanent fence state. Every fence has some form of permanent state
- * (type != ANV_SEMAPHORE_TYPE_NONE). This may be a BO to fence on (for
- * cross-process fences) or it could just be a dummy for use internally.
- */
- struct anv_fence_impl permanent;
+extern const struct vk_sync_type anv_bo_sync_type;
- /* Temporary fence state. A fence *may* have temporary state. That state
- * is added to the fence by an import operation and is reset back to
- * ANV_SEMAPHORE_TYPE_NONE when the fence is reset. A fence with temporary
- * state cannot be signaled because the fence must already be signaled
- * before the temporary state can be exported from the fence in the other
- * process and imported here.
- */
- struct anv_fence_impl temporary;
-};
+static inline bool
+vk_sync_is_anv_bo_sync(const struct vk_sync *sync)
+{
+ return sync->type == &anv_bo_sync_type;
+}
-void anv_fence_reset_temporary(struct anv_device *device,
- struct anv_fence *fence);
+VkResult anv_create_sync_for_memory(struct vk_device *device,
+ VkDeviceMemory memory,
+ bool signal_memory,
+ struct vk_sync **sync_out);
struct anv_event {
struct vk_object_base base;
@@ -3342,89 +4341,6 @@ struct anv_event {
struct anv_state state;
};
-enum anv_semaphore_type {
- ANV_SEMAPHORE_TYPE_NONE = 0,
- ANV_SEMAPHORE_TYPE_DUMMY,
- ANV_SEMAPHORE_TYPE_WSI_BO,
- ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ,
- ANV_SEMAPHORE_TYPE_TIMELINE,
- ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE,
-};
-
-struct anv_timeline_point {
- struct list_head link;
-
- uint64_t serial;
-
- /* Number of waiter on this point, when > 0 the point should not be garbage
- * collected.
- */
- int waiting;
-
- /* BO used for synchronization. */
- struct anv_bo *bo;
-};
-
-struct anv_timeline {
- pthread_mutex_t mutex;
- pthread_cond_t cond;
-
- uint64_t highest_past;
- uint64_t highest_pending;
-
- struct list_head points;
- struct list_head free_points;
-};
-
-struct anv_semaphore_impl {
- enum anv_semaphore_type type;
-
- union {
- /* A BO representing this semaphore when type == ANV_SEMAPHORE_TYPE_BO
- * or type == ANV_SEMAPHORE_TYPE_WSI_BO. This BO will be added to the
- * object list on any execbuf2 calls for which this semaphore is used as
- * a wait or signal fence. When used as a signal fence or when type ==
- * ANV_SEMAPHORE_TYPE_WSI_BO, the EXEC_OBJECT_WRITE flag will be set.
- */
- struct anv_bo *bo;
-
- /* Sync object handle when type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ.
- * Unlike GEM BOs, DRM sync objects aren't deduplicated by the kernel on
- * import so we don't need to bother with a userspace cache.
- */
- uint32_t syncobj;
-
- /* Non shareable timeline semaphore
- *
- * Used when kernel don't have support for timeline semaphores.
- */
- struct anv_timeline timeline;
- };
-};
-
-struct anv_semaphore {
- struct vk_object_base base;
-
- /* Permanent semaphore state. Every semaphore has some form of permanent
- * state (type != ANV_SEMAPHORE_TYPE_NONE). This may be a BO to fence on
- * (for cross-process semaphores0 or it could just be a dummy for use
- * internally.
- */
- struct anv_semaphore_impl permanent;
-
- /* Temporary semaphore state. A semaphore *may* have temporary state.
- * That state is added to the semaphore by an import operation and is reset
- * back to ANV_SEMAPHORE_TYPE_NONE when the semaphore is waited on. A
- * semaphore with temporary state cannot be signaled because the semaphore
- * must already be signaled before the temporary state can be exported from
- * the semaphore in the other process and imported here.
- */
- struct anv_semaphore_impl temporary;
-};
-
-void anv_semaphore_reset_temporary(struct anv_device *device,
- struct anv_semaphore *semaphore);
-
#define ANV_STAGE_MASK ((1 << MESA_VULKAN_SHADER_STAGES) - 1)
#define anv_foreach_stage(stage, stage_bits) \
@@ -3440,24 +4356,71 @@ struct anv_pipeline_bind_map {
uint32_t surface_count;
uint32_t sampler_count;
+ uint32_t embedded_sampler_count;
+ uint16_t kernel_args_size;
+ uint16_t kernel_arg_count;
struct anv_pipeline_binding * surface_to_descriptor;
struct anv_pipeline_binding * sampler_to_descriptor;
+ struct anv_pipeline_embedded_sampler_binding* embedded_sampler_to_binding;
+ struct brw_kernel_arg_desc * kernel_args;
struct anv_push_range push_ranges[4];
};
-struct anv_shader_bin_key {
- uint32_t size;
- uint8_t data[0];
+struct anv_push_descriptor_info {
+ /* A bitfield of descriptors used. */
+ uint32_t used_descriptors;
+
+ /* A bitfield of UBO bindings fully promoted to push constants. */
+ uint32_t fully_promoted_ubo_descriptors;
+
+ /* */
+ uint8_t used_set_buffer;
};
-struct anv_shader_bin {
- uint32_t ref_cnt;
+/* A list of values we push to implement some of the dynamic states */
+enum anv_dynamic_push_bits {
+ ANV_DYNAMIC_PUSH_INPUT_VERTICES = BITFIELD_BIT(0),
+};
+struct anv_shader_upload_params {
gl_shader_stage stage;
- const struct anv_shader_bin_key *key;
+ const void *key_data;
+ uint32_t key_size;
+
+ const void *kernel_data;
+ uint32_t kernel_size;
+
+ const struct brw_stage_prog_data *prog_data;
+ uint32_t prog_data_size;
+
+ const struct brw_compile_stats *stats;
+ uint32_t num_stats;
+
+ const struct nir_xfb_info *xfb_info;
+
+ const struct anv_pipeline_bind_map *bind_map;
+
+ const struct anv_push_descriptor_info *push_desc_info;
+
+ enum anv_dynamic_push_bits dynamic_push_values;
+};
+
+struct anv_embedded_sampler {
+ uint32_t ref_cnt;
+
+ struct anv_embedded_sampler_key key;
+
+ struct anv_state sampler_state;
+ struct anv_state border_color_state;
+};
+
+struct anv_shader_bin {
+ struct vk_pipeline_cache_object base;
+
+ gl_shader_stage stage;
struct anv_state kernel;
uint32_t kernel_size;
@@ -3470,51 +4433,33 @@ struct anv_shader_bin {
struct nir_xfb_info *xfb_info;
+ struct anv_push_descriptor_info push_desc_info;
+
struct anv_pipeline_bind_map bind_map;
-};
-struct anv_shader_bin *
-anv_shader_bin_create(struct anv_device *device,
- gl_shader_stage stage,
- const void *key, uint32_t key_size,
- const void *kernel, uint32_t kernel_size,
- const struct brw_stage_prog_data *prog_data,
- uint32_t prog_data_size,
- const struct brw_compile_stats *stats, uint32_t num_stats,
- const struct nir_xfb_info *xfb_info,
- const struct anv_pipeline_bind_map *bind_map);
+ enum anv_dynamic_push_bits dynamic_push_values;
-void
-anv_shader_bin_destroy(struct anv_device *device, struct anv_shader_bin *shader);
+ /* Not saved in the pipeline cache.
+ *
+ * Array of pointers of length bind_map.embedded_sampler_count
+ */
+ struct anv_embedded_sampler **embedded_samplers;
+};
-static inline void
+static inline struct anv_shader_bin *
anv_shader_bin_ref(struct anv_shader_bin *shader)
{
- assert(shader && shader->ref_cnt >= 1);
- p_atomic_inc(&shader->ref_cnt);
+ vk_pipeline_cache_object_ref(&shader->base);
+
+ return shader;
}
static inline void
anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader)
{
- assert(shader && shader->ref_cnt >= 1);
- if (p_atomic_dec_zero(&shader->ref_cnt))
- anv_shader_bin_destroy(device, shader);
+ vk_pipeline_cache_object_unref(&device->vk, &shader->base);
}
-#define anv_shader_bin_get_bsr(bin, local_arg_offset) ({ \
- assert((local_arg_offset) % 8 == 0); \
- const struct brw_bs_prog_data *prog_data = \
- brw_bs_prog_data_const(bin->prog_data); \
- assert(prog_data->simd_size == 8 || prog_data->simd_size == 16); \
- \
- (struct GFX_BINDLESS_SHADER_RECORD) { \
- .OffsetToLocalArguments = (local_arg_offset) / 8, \
- .BindlessShaderDispatchMode = prog_data->simd_size / 16, \
- .KernelStartPointer = bin->kernel.offset, \
- }; \
-})
-
struct anv_pipeline_executable {
gl_shader_stage stage;
@@ -3526,6 +4471,7 @@ struct anv_pipeline_executable {
enum anv_pipeline_type {
ANV_PIPELINE_GRAPHICS,
+ ANV_PIPELINE_GRAPHICS_LIB,
ANV_PIPELINE_COMPUTE,
ANV_PIPELINE_RAY_TRACING,
};
@@ -3543,110 +4489,249 @@ struct anv_pipeline {
enum anv_pipeline_type type;
VkPipelineCreateFlags flags;
+ VkShaderStageFlags active_stages;
+
+ uint32_t ray_queries;
+
+ /**
+ * Mask of stages that are accessing push descriptors.
+ */
+ VkShaderStageFlags use_push_descriptor;
+
+ /**
+ * Mask of stages that are accessing the push descriptors buffer.
+ */
+ VkShaderStageFlags use_push_descriptor_buffer;
+
+ /**
+ * Maximum scratch size for all shaders in this pipeline.
+ */
+ uint32_t scratch_size;
+
+ /* Layout of the sets used by the pipeline. */
+ struct anv_pipeline_sets_layout layout;
+
struct util_dynarray executables;
const struct intel_l3_config * l3_config;
};
-struct anv_graphics_pipeline {
+/* The base graphics pipeline object only hold shaders. */
+struct anv_graphics_base_pipeline {
struct anv_pipeline base;
- uint32_t batch_data[512];
+ struct vk_sample_locations_state sample_locations;
+
+ /* Shaders */
+ struct anv_shader_bin * shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
- /* States that are part of batch_data and should be not emitted
- * dynamically.
+ /* A small hash based on shader_info::source_sha1 for identifying
+ * shaders in renderdoc/shader-db.
*/
- anv_cmd_dirty_mask_t static_state_mask;
+ uint32_t source_hashes[ANV_GRAPHICS_SHADER_STAGE_COUNT];
- /* States that need to be reemitted in cmd_buffer_flush_dynamic_state().
- * This might cover more than the dynamic states specified at pipeline
- * creation.
+ /* Feedback index in
+ * VkPipelineCreationFeedbackCreateInfo::pPipelineStageCreationFeedbacks
+ *
+ * For pipeline libraries, we need to remember the order at creation when
+ * included into a linked pipeline.
*/
- anv_cmd_dirty_mask_t dynamic_state_mask;
+ uint32_t feedback_index[ANV_GRAPHICS_SHADER_STAGE_COUNT];
- struct anv_dynamic_state dynamic_state;
+ /* Robustness flags used to compile the shaders
+ */
+ enum brw_robustness_flags robust_flags[ANV_GRAPHICS_SHADER_STAGE_COUNT];
- /* States declared dynamic at pipeline creation. */
- anv_cmd_dirty_mask_t dynamic_states;
+ /* True if at the time the fragment shader was compiled, it didn't have all
+ * the information to avoid INTEL_MSAA_FLAG_ENABLE_DYNAMIC.
+ */
+ bool fragment_dynamic;
+};
- uint32_t topology;
+/* The library graphics pipeline object has a partial graphics state and
+ * possibly some shaders. If requested, shaders are also kept in their early
+ * NIR form.
+ */
+struct anv_graphics_lib_pipeline {
+ struct anv_graphics_base_pipeline base;
- /* These fields are required with dynamic primitive topology,
- * rasterization_samples used only with gen < 8.
- */
- VkLineRasterizationModeEXT line_mode;
- VkPolygonMode polygon_mode;
- uint32_t rasterization_samples;
+ VkGraphicsPipelineLibraryFlagsEXT lib_flags;
- struct anv_subpass * subpass;
+ struct vk_graphics_pipeline_all_state all_state;
+ struct vk_graphics_pipeline_state state;
- struct anv_shader_bin * shaders[MESA_SHADER_STAGES];
+ /* Retained shaders for link optimization. */
+ struct {
+ /* This hash is the same as computed in
+ * anv_graphics_pipeline_gather_shaders().
+ */
+ unsigned char shader_sha1[20];
- VkShaderStageFlags active_stages;
+ enum gl_subgroup_size subgroup_size_type;
+
+ /* NIR captured in anv_pipeline_stage_get_nir(), includes specialization
+ * constants.
+ */
+ nir_shader * nir;
+ } retained_shaders[ANV_GRAPHICS_SHADER_STAGE_COUNT];
+
+ /* Whether the shaders have been retained */
+ bool retain_shaders;
+};
+
+struct anv_gfx_state_ptr {
+ /* Both in dwords */
+ uint16_t offset;
+ uint16_t len;
+};
+
+/* The final graphics pipeline object has all the graphics state ready to be
+ * programmed into HW packets (dynamic_state field) or fully baked in its
+ * batch.
+ */
+struct anv_graphics_pipeline {
+ struct anv_graphics_base_pipeline base;
+
+ struct vk_vertex_input_state vertex_input;
+ struct vk_sample_locations_state sample_locations;
+ struct vk_dynamic_graphics_state dynamic_state;
+
+ /* If true, the patch control points are passed through push constants
+ * (anv_push_constants::gfx::tcs_input_vertices)
+ */
+ bool dynamic_patch_control_points;
+
+ uint32_t view_mask;
+ uint32_t instance_multiplier;
+
+ bool rp_has_ds_self_dep;
- bool writes_depth;
- bool depth_test_enable;
- bool writes_stencil;
- bool stencil_test_enable;
- bool depth_clamp_enable;
- bool depth_clip_enable;
- bool sample_shading_enable;
bool kill_pixel;
- bool depth_bounds_test_enable;
bool force_fragment_thread_dispatch;
+ bool uses_xfb;
+ bool sample_shading_enable;
+ float min_sample_shading;
- /* When primitive replication is used, subpass->view_mask will describe what
- * views to replicate.
- */
- bool use_primitive_replication;
+ /* Number of VERTEX_ELEMENT_STATE input elements used by the shader */
+ uint32_t vs_input_elements;
- struct anv_state blend_state;
+ /* Number of VERTEX_ELEMENT_STATE elements we need to implement some of the
+ * draw parameters
+ */
+ uint32_t svgs_count;
- struct anv_state cps_state;
+ /* Precomputed VERTEX_ELEMENT_STATE structures for the vertex input that
+ * can be copied into the anv_cmd_buffer behind a 3DSTATE_VERTEX_BUFFER.
+ *
+ * When MESA_VK_DYNAMIC_VI is not dynamic
+ *
+ * vertex_input_elems = vs_input_elements + svgs_count
+ *
+ * All the VERTEX_ELEMENT_STATE can be directly copied behind a
+ * 3DSTATE_VERTEX_ELEMENTS instruction in the command buffer. Otherwise
+ * this array only holds the svgs_count elements.
+ */
+ uint32_t vertex_input_elems;
+ uint32_t vertex_input_data[2 * 31 /* MAX_VES + 2 internal */];
- uint32_t vb_used;
- struct anv_pipeline_vertex_binding {
- uint32_t stride;
- bool instanced;
- uint32_t instance_divisor;
- } vb[MAX_VBS];
+ /* Precomputed CS instructions that can be directly copied into the
+ * anv_cmd_buffer.
+ */
+ uint32_t batch_data[416];
- struct {
- uint32_t sf[7];
- uint32_t depth_stencil_state[3];
- uint32_t clip[4];
- uint32_t xfb_bo_pitch[4];
- uint32_t wm[3];
- uint32_t blend_state[MAX_RTS * 2];
- uint32_t streamout_state[3];
- } gfx7;
+ /* URB setup used by this pipeline. */
+ struct intel_urb_config urb_cfg;
+ /* Fully baked instructions, ready to be emitted into the anv_cmd_buffer */
struct {
- uint32_t sf[4];
- uint32_t raster[5];
- uint32_t wm_depth_stencil[3];
- uint32_t wm[2];
- uint32_t ps_blend[2];
- uint32_t blend_state[1 + MAX_RTS * 2];
- uint32_t streamout_state[5];
- } gfx8;
-
+ struct anv_gfx_state_ptr urb;
+ struct anv_gfx_state_ptr vf_statistics;
+ struct anv_gfx_state_ptr vf_sgvs;
+ struct anv_gfx_state_ptr vf_sgvs_2;
+ struct anv_gfx_state_ptr vf_sgvs_instancing;
+ struct anv_gfx_state_ptr vf_instancing;
+ struct anv_gfx_state_ptr primitive_replication;
+ struct anv_gfx_state_ptr sbe;
+ struct anv_gfx_state_ptr sbe_swiz;
+ struct anv_gfx_state_ptr so_decl_list;
+ struct anv_gfx_state_ptr vs;
+ struct anv_gfx_state_ptr hs;
+ struct anv_gfx_state_ptr ds;
+
+ struct anv_gfx_state_ptr task_control;
+ struct anv_gfx_state_ptr task_shader;
+ struct anv_gfx_state_ptr task_redistrib;
+ struct anv_gfx_state_ptr clip_mesh;
+ struct anv_gfx_state_ptr mesh_control;
+ struct anv_gfx_state_ptr mesh_shader;
+ struct anv_gfx_state_ptr mesh_distrib;
+ struct anv_gfx_state_ptr sbe_mesh;
+ } final;
+
+ /* Pre-packed CS instructions & structures that need to be merged later
+ * with dynamic state.
+ */
struct {
- uint32_t wm_depth_stencil[4];
- } gfx9;
+ struct anv_gfx_state_ptr clip;
+ struct anv_gfx_state_ptr sf;
+ struct anv_gfx_state_ptr raster;
+ struct anv_gfx_state_ptr ms;
+ struct anv_gfx_state_ptr ps_extra;
+ struct anv_gfx_state_ptr wm;
+ struct anv_gfx_state_ptr so;
+ struct anv_gfx_state_ptr gs;
+ struct anv_gfx_state_ptr te;
+ struct anv_gfx_state_ptr ps;
+ struct anv_gfx_state_ptr vfg;
+ } partial;
};
+#define anv_batch_merge_pipeline_state(batch, dwords0, pipeline, state) \
+ do { \
+ uint32_t *dw; \
+ \
+ assert(ARRAY_SIZE(dwords0) == (pipeline)->state.len); \
+ dw = anv_batch_emit_dwords((batch), ARRAY_SIZE(dwords0)); \
+ if (!dw) \
+ break; \
+ for (uint32_t i = 0; i < ARRAY_SIZE(dwords0); i++) \
+ dw[i] = (dwords0)[i] | \
+ (pipeline)->batch_data[(pipeline)->state.offset + i]; \
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, ARRAY_SIZE(dwords0) * 4)); \
+ } while (0)
+
+#define anv_batch_emit_pipeline_state(batch, pipeline, state) \
+ do { \
+ if ((pipeline)->state.len == 0) \
+ break; \
+ uint32_t *dw; \
+ dw = anv_batch_emit_dwords((batch), (pipeline)->state.len); \
+ if (!dw) \
+ break; \
+ memcpy(dw, &(pipeline)->batch_data[(pipeline)->state.offset], \
+ 4 * (pipeline)->state.len); \
+ } while (0)
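+
+/* A rough usage sketch for the two macros above (the packet, field and
+ * variable names below are illustrative only):
+ *
+ * Fully baked state is copied straight out of the pipeline batch data:
+ *
+ *    anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vs);
+ *
+ * Partial state is packed with its dynamic fields first and then OR'ed with
+ * the pre-packed dwords:
+ *
+ *    uint32_t dwords[GENX(3DSTATE_SF_length)];
+ *    struct GENX(3DSTATE_SF) sf = {
+ *       GENX(3DSTATE_SF_header),
+ *       .LineWidth = line_width,
+ *    };
+ *    GENX(3DSTATE_SF_pack)(NULL, dwords, &sf);
+ *    anv_batch_merge_pipeline_state(&cmd_buffer->batch, dwords,
+ *                                   pipeline, partial.sf);
+ */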
+
+
struct anv_compute_pipeline {
struct anv_pipeline base;
struct anv_shader_bin * cs;
uint32_t batch_data[9];
uint32_t interface_descriptor_data[8];
+
+ /* A small hash based on shader_info::source_sha1 for identifying shaders
+ * in renderdoc/shader-db.
+ */
+ uint32_t source_hash;
};
struct anv_rt_shader_group {
VkRayTracingShaderGroupTypeKHR type;
+ /* Whether this group was imported from another pipeline */
+ bool imported;
+
struct anv_shader_bin *general;
struct anv_shader_bin *closest_hit;
struct anv_shader_bin *any_hit;
@@ -3681,6 +4766,8 @@ struct anv_ray_tracing_pipeline {
}
ANV_DECL_PIPELINE_DOWNCAST(graphics, ANV_PIPELINE_GRAPHICS)
+ANV_DECL_PIPELINE_DOWNCAST(graphics_base, ANV_PIPELINE_GRAPHICS)
+ANV_DECL_PIPELINE_DOWNCAST(graphics_lib, ANV_PIPELINE_GRAPHICS_LIB)
ANV_DECL_PIPELINE_DOWNCAST(compute, ANV_PIPELINE_COMPUTE)
ANV_DECL_PIPELINE_DOWNCAST(ray_tracing, ANV_PIPELINE_RAY_TRACING)
@@ -3688,7 +4775,59 @@ static inline bool
anv_pipeline_has_stage(const struct anv_graphics_pipeline *pipeline,
gl_shader_stage stage)
{
- return (pipeline->active_stages & mesa_to_vk_shader_stage(stage)) != 0;
+ return (pipeline->base.base.active_stages & mesa_to_vk_shader_stage(stage)) != 0;
+}
+
+static inline bool
+anv_pipeline_base_has_stage(const struct anv_graphics_base_pipeline *pipeline,
+ gl_shader_stage stage)
+{
+ return (pipeline->base.active_stages & mesa_to_vk_shader_stage(stage)) != 0;
+}
+
+static inline bool
+anv_pipeline_is_primitive(const struct anv_graphics_pipeline *pipeline)
+{
+ return anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX);
+}
+
+static inline bool
+anv_pipeline_is_mesh(const struct anv_graphics_pipeline *pipeline)
+{
+ return anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH);
+}
+
+static inline bool
+anv_cmd_buffer_all_color_write_masked(const struct anv_cmd_buffer *cmd_buffer)
+{
+ const struct anv_cmd_graphics_state *state = &cmd_buffer->state.gfx;
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ uint8_t color_writes = dyn->cb.color_write_enables;
+
+ /* All writes disabled through vkCmdSetColorWriteEnableEXT */
+ if ((color_writes & ((1u << state->color_att_count) - 1)) == 0)
+ return true;
+
+ /* Or all write masks are empty */
+ for (uint32_t i = 0; i < state->color_att_count; i++) {
+ if (dyn->cb.attachments[i].write_mask != 0)
+ return false;
+ }
+
+ return true;
+}
+
+static inline void
+anv_cmd_graphic_state_update_has_uint_rt(struct anv_cmd_graphics_state *state)
+{
+ state->has_uint_rt = false;
+ for (unsigned a = 0; a < state->color_att_count; a++) {
+ if (vk_format_is_int(state->color_att[a].vk_format)) {
+ state->has_uint_rt = true;
+ break;
+ }
+ }
}
#define ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(prefix, stage) \
@@ -3697,7 +4836,7 @@ get_##prefix##_prog_data(const struct anv_graphics_pipeline *pipeline) \
{ \
if (anv_pipeline_has_stage(pipeline, stage)) { \
return (const struct brw_##prefix##_prog_data *) \
- pipeline->shaders[stage]->prog_data; \
+ pipeline->base.shaders[stage]->prog_data; \
} else { \
return NULL; \
} \
@@ -3708,6 +4847,8 @@ ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(tcs, MESA_SHADER_TESS_CTRL)
ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(tes, MESA_SHADER_TESS_EVAL)
ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(gs, MESA_SHADER_GEOMETRY)
ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(wm, MESA_SHADER_FRAGMENT)
+ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(mesh, MESA_SHADER_MESH)
+ANV_DECL_GET_GRAPHICS_PROG_DATA_FUNC(task, MESA_SHADER_TASK)
static inline const struct brw_cs_prog_data *
get_cs_prog_data(const struct anv_compute_pipeline *pipeline)
@@ -3733,62 +4874,38 @@ anv_device_init_rt_shaders(struct anv_device *device);
void
anv_device_finish_rt_shaders(struct anv_device *device);
-VkResult
-anv_pipeline_init(struct anv_pipeline *pipeline,
- struct anv_device *device,
- enum anv_pipeline_type type,
- VkPipelineCreateFlags flags,
- const VkAllocationCallbacks *pAllocator);
-
-void
-anv_pipeline_finish(struct anv_pipeline *pipeline,
- struct anv_device *device,
- const VkAllocationCallbacks *pAllocator);
+struct anv_kernel_arg {
+ bool is_ptr;
+ uint16_t size;
-VkResult
-anv_graphics_pipeline_init(struct anv_graphics_pipeline *pipeline, struct anv_device *device,
- struct anv_pipeline_cache *cache,
- const VkGraphicsPipelineCreateInfo *pCreateInfo,
- const VkAllocationCallbacks *alloc);
-
-VkResult
-anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
- struct anv_pipeline_cache *cache,
- const VkComputePipelineCreateInfo *info,
- const struct vk_shader_module *module,
- const char *entrypoint,
- const VkSpecializationInfo *spec_info);
+ union {
+ uint64_t u64;
+ void *ptr;
+ };
+};
-VkResult
-anv_ray_tracing_pipeline_init(struct anv_ray_tracing_pipeline *pipeline,
- struct anv_device *device,
- struct anv_pipeline_cache *cache,
- const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
- const VkAllocationCallbacks *alloc);
+struct anv_kernel {
+#ifndef NDEBUG
+ const char *name;
+#endif
+ struct anv_shader_bin *bin;
+ const struct intel_l3_config *l3_config;
+};
struct anv_format_plane {
enum isl_format isl_format:16;
struct isl_swizzle swizzle;
- /* Whether this plane contains chroma channels */
- bool has_chroma;
-
- /* For downscaling of YUV planes */
- uint8_t denominator_scales[2];
-
- /* How to map sampled ycbcr planes to a single 4 component element. */
- struct isl_swizzle ycbcr_swizzle;
-
/* What aspect is associated to this plane */
VkImageAspectFlags aspect;
};
-
struct anv_format {
struct anv_format_plane planes[3];
VkFormat vk_format;
uint8_t n_planes;
bool can_ycbcr;
+ bool can_video;
};
static inline void
@@ -3865,15 +4982,41 @@ anv_get_isl_format(const struct intel_device_info *devinfo, VkFormat vk_format,
return anv_get_format_aspect(devinfo, vk_format, aspect, tiling).isl_format;
}
+bool anv_format_supports_ccs_e(const struct intel_device_info *devinfo,
+ const enum isl_format format);
+
bool anv_formats_ccs_e_compatible(const struct intel_device_info *devinfo,
VkImageCreateFlags create_flags,
- VkFormat vk_format,
- VkImageTiling vk_tiling,
- const VkImageFormatListCreateInfoKHR *fmt_list);
+ VkFormat vk_format, VkImageTiling vk_tiling,
+ VkImageUsageFlags vk_usage,
+ const VkImageFormatListCreateInfo *fmt_list);
extern VkFormat
vk_format_from_android(unsigned android_format, unsigned android_usage);
+static inline VkFormat
+anv_get_emulation_format(const struct anv_physical_device *pdevice, VkFormat format)
+{
+ if (pdevice->flush_astc_ldr_void_extent_denorms) {
+ const struct util_format_description *desc =
+ vk_format_description(format);
+ if (desc->layout == UTIL_FORMAT_LAYOUT_ASTC &&
+ desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB)
+ return format;
+ }
+
+ if (pdevice->emu_astc_ldr)
+ return vk_texcompress_astc_emulation_format(format);
+
+ return VK_FORMAT_UNDEFINED;
+}
+
+static inline bool
+anv_is_format_emulated(const struct anv_physical_device *pdevice, VkFormat format)
+{
+ return anv_get_emulation_format(pdevice, format) != VK_FORMAT_UNDEFINED;
+}
+
static inline struct isl_swizzle
anv_swizzle_for_render(struct isl_swizzle swizzle)
{
@@ -3932,14 +5075,14 @@ struct anv_image_memory_range {
ANV_IMAGE_MEMORY_BINDING_END,
} binding;
+ uint32_t alignment;
+ uint64_t size;
+
/**
* Offset is relative to the start of the binding created by
* vkBindImageMemory, not to the start of the bo.
*/
uint64_t offset;
-
- uint64_t size;
- uint32_t alignment;
};
/**
@@ -3968,6 +5111,11 @@ struct anv_image {
bool disjoint;
/**
+ * Image is a WSI image
+ */
+ bool from_wsi;
+
+ /**
* Image was imported from an struct AHardwareBuffer. We have to delay
* final image creation until bind time.
*/
@@ -3980,6 +5128,12 @@ struct anv_image {
bool from_gralloc;
/**
+ * If not UNDEFINED, image has a hidden plane at planes[n_planes] for ASTC
+ * LDR workaround or emulation.
+ */
+ VkFormat emu_plane_format;
+
+ /**
* The memory bindings created by vkCreateImage and vkBindImageMemory.
*
* For details on the image's memory layout, see check_memory_bindings().
@@ -3996,6 +5150,7 @@ struct anv_image {
struct anv_image_binding {
struct anv_image_memory_range memory_range;
struct anv_address address;
+ struct anv_sparse_binding_data sparse_data;
} bindings[ANV_IMAGE_MEMORY_BINDING_END];
/**
@@ -4015,13 +5170,6 @@ struct anv_image {
struct anv_surface primary_surface;
/**
- * A surface which shadows the main surface and may have different
- * tiling. This is used for sampling using a tiling that isn't supported
- * for other operations.
- */
- struct anv_surface shadow_surface;
-
- /**
* The base aux usage for this image. For color images, this can be
* either CCS_E or CCS_D depending on whether or not we can reliably
* leave CCS on all the time.
@@ -4030,11 +5178,77 @@ struct anv_image {
struct anv_surface aux_surface;
+ /** Location of the compression control surface. */
+ struct anv_image_memory_range compr_ctrl_memory_range;
+
/** Location of the fast clear state. */
struct anv_image_memory_range fast_clear_memory_range;
+
+ /**
+ * Whether this image can be fast cleared with non-zero clear colors.
+ * This can happen with mutable images when formats with different bit
+ * sizes per component are used.
+ *
+ * On Gfx9+, because the clear color is stored as four 32-bit components,
+ * we can clear in R16G16_UNORM (storing two 16-bit values in components
+ * 0 & 1 of the clear color) and then draw in R32_UINT, which would
+ * interpret the clear color as a single component value, using only the
+ * first 16-bit component of the previously written clear color.
+ *
+ * On Gfx7/7.5/8, only CC_ZERO/CC_ONE clear colors are supported; this
+ * boolean will prevent the usage of CC_ONE.
+ */
+ bool can_non_zero_fast_clear;
+
+ struct {
+ /** Whether the image has CCS data mapped through AUX-TT. */
+ bool mapped;
+
+ /** Main address of the mapping. */
+ uint64_t addr;
+
+ /** Size of the mapping. */
+ uint64_t size;
+ } aux_tt;
} planes[3];
+
+ struct anv_image_memory_range vid_dmv_top_surface;
+
+ /* Link in the anv_device.image_private_objects list */
+ struct list_head link;
};
+static inline bool
+anv_image_is_sparse(const struct anv_image *image)
+{
+ return image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT;
+}
+
+static inline bool
+anv_image_is_externally_shared(const struct anv_image *image)
+{
+ return image->vk.drm_format_mod != DRM_FORMAT_MOD_INVALID ||
+ image->vk.external_handle_types != 0;
+}
+
+static inline bool
+anv_image_has_private_binding(const struct anv_image *image)
+{
+ const struct anv_image_binding private_binding =
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE];
+ return private_binding.memory_range.size != 0;
+}
+
+static inline bool
+anv_image_format_is_d16_or_s8(const struct anv_image *image)
+{
+ return image->vk.format == VK_FORMAT_D16_UNORM ||
+ image->vk.format == VK_FORMAT_D16_UNORM_S8_UINT ||
+ image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
+ image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
+ image->vk.format == VK_FORMAT_S8_UINT;
+}
+
/* The ordering of this enum is important */
enum anv_fast_clear_type {
/** Image does not have/support any fast-clear blocks */
@@ -4127,9 +5341,16 @@ anv_image_get_fast_clear_type_addr(const struct anv_device *device,
struct anv_address addr =
anv_image_get_clear_color_addr(device, image, aspect);
- const unsigned clear_color_state_size = device->info.ver >= 10 ?
- device->isl_dev.ss.clear_color_state_size :
- device->isl_dev.ss.clear_value_size;
+ unsigned clear_color_state_size;
+ if (device->info->ver >= 11) {
+ /* The fast clear type and the first compression state are stored in the
+ * last 2 dwords of the clear color struct. Refer to the comment in
+ * add_aux_state_tracking_buffer().
+ */
+ assert(device->isl_dev.ss.clear_color_state_size >= 32);
+ clear_color_state_size = device->isl_dev.ss.clear_color_state_size - 8;
+ } else
+ clear_color_state_size = device->isl_dev.ss.clear_value_size;
return anv_address_add(addr, clear_color_state_size);
}
@@ -4142,16 +5363,16 @@ anv_image_get_compression_state_addr(const struct anv_device *device,
assert(level < anv_image_aux_levels(image, aspect));
assert(array_layer < anv_image_aux_layers(image, aspect, level));
UNUSED uint32_t plane = anv_image_aspect_to_plane(image, aspect);
- assert(image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E);
+ assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
- /* Relative to start of the plane's fast clear memory range */
+ /* Relative to start of the plane's fast clear type */
uint32_t offset;
offset = 4; /* Go past the fast clear type */
if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
for (uint32_t l = 0; l < level; l++)
- offset += anv_minify(image->vk.extent.depth, l) * 4;
+ offset += u_minify(image->vk.extent.depth, l) * 4;
} else {
offset += level * image->vk.array_layers * 4;
}
@@ -4165,6 +5386,16 @@ anv_image_get_compression_state_addr(const struct anv_device *device,
offset);
}
+static inline const struct anv_image_memory_range *
+anv_image_get_aux_memory_range(const struct anv_image *image,
+ uint32_t plane)
+{
+ if (image->planes[plane].aux_surface.memory_range.size > 0)
+ return &image->planes[plane].aux_surface.memory_range;
+ else
+ return &image->planes[plane].compr_ctrl_memory_range;
+}
+
/* Returns true if a HiZ-enabled depth buffer can be sampled from. */
static inline bool
anv_can_sample_with_hiz(const struct intel_device_info * const devinfo,
@@ -4183,13 +5414,7 @@ anv_can_sample_with_hiz(const struct intel_device_info * const devinfo,
if (image->vk.image_type == VK_IMAGE_TYPE_3D)
return false;
- /* Allow this feature on BDW even though it is disabled in the BDW devinfo
- * struct. There's documentation which suggests that this feature actually
- * reduces performance on BDW, but it has only been observed to help so
- * far. Sampling fast-cleared blocks on BDW must also be handled with care
- * (see depth_stencil_attachment_compute_aux_usage() for more info).
- */
- if (devinfo->ver != 8 && !devinfo->has_sample_with_hiz)
+ if (!devinfo->has_sample_with_hiz)
return false;
return image->vk.samples == 1;
@@ -4212,7 +5437,7 @@ anv_can_sample_mcs_with_clear(const struct intel_device_info * const devinfo,
* See HSD 1707282275, wa_14013111325. Due to the use of
* format-reinterpretation, a simplified workaround is implemented.
*/
- if (devinfo->ver >= 12 &&
+ if (intel_needs_workaround(devinfo, 14013111325) &&
isl_format_get_layout(anv_surf->isl.format)->bpb <= 16) {
return false;
}
@@ -4225,10 +5450,50 @@ anv_image_plane_uses_aux_map(const struct anv_device *device,
const struct anv_image *image,
uint32_t plane)
{
- return device->info.has_aux_map &&
+ return device->info->has_aux_map &&
isl_aux_usage_has_ccs(image->planes[plane].aux_usage);
}
+static inline bool
+anv_image_uses_aux_map(const struct anv_device *device,
+ const struct anv_image *image)
+{
+ for (uint32_t p = 0; p < image->n_planes; ++p) {
+ if (anv_image_plane_uses_aux_map(device, image, p))
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool
+anv_bo_allows_aux_map(const struct anv_device *device,
+ const struct anv_bo *bo)
+{
+ if (device->aux_map_ctx == NULL)
+ return false;
+
+ return (bo->alloc_flags & ANV_BO_ALLOC_AUX_TT_ALIGNED) != 0;
+}
+
+static inline bool
+anv_address_allows_aux_map(const struct anv_device *device,
+ struct anv_address addr)
+{
+ if (device->aux_map_ctx == NULL)
+ return false;
+
+ /* Technically, we only care about the offset at which the image is bound
+ * within the BO, but we don't have that information here. As a heuristic,
+ * rely on the BO offset instead.
+ */
+ if (anv_address_physical(addr) %
+ intel_aux_map_get_alignment(device->aux_map_ctx) != 0)
+ return false;
+
+ return true;
+}
+
void
anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
@@ -4239,6 +5504,21 @@ anv_cmd_buffer_mark_image_written(struct anv_cmd_buffer *cmd_buffer,
uint32_t layer_count);
void
+anv_cmd_buffer_mark_image_fast_cleared(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ const enum isl_format format,
+ union isl_color_value clear_color);
+
+void
+anv_cmd_buffer_load_clear_color_from_image(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state state,
+ const struct anv_image *image);
+
+struct anv_image_binding *
+anv_image_aspect_to_binding(struct anv_image *image,
+ VkImageAspectFlags aspect);
+
+void
anv_image_clear_color(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
VkImageAspectFlagBits aspect,
@@ -4256,19 +5536,10 @@ anv_image_clear_depth_stencil(struct anv_cmd_buffer *cmd_buffer,
VkRect2D area,
float depth_value, uint8_t stencil_value);
void
-anv_image_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *src_image,
- enum isl_aux_usage src_aux_usage,
- uint32_t src_level, uint32_t src_base_layer,
- const struct anv_image *dst_image,
- enum isl_aux_usage dst_aux_usage,
- uint32_t dst_level, uint32_t dst_base_layer,
- VkImageAspectFlagBits aspect,
- uint32_t src_x, uint32_t src_y,
- uint32_t dst_x, uint32_t dst_y,
- uint32_t width, uint32_t height,
- uint32_t layer_count,
- enum blorp_filter filter);
+anv_attachment_msaa_resolve(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_attachment *att,
+ VkImageLayout layout,
+ VkImageAspectFlagBits aspect);
void
anv_image_hiz_op(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
@@ -4299,31 +5570,69 @@ anv_image_ccs_op(struct anv_cmd_buffer *cmd_buffer,
enum isl_aux_op ccs_op, union isl_color_value *clear_value,
bool predicate);
+isl_surf_usage_flags_t
+anv_image_choose_isl_surf_usage(struct anv_physical_device *device,
+ VkImageCreateFlags vk_create_flags,
+ VkImageUsageFlags vk_usage,
+ isl_surf_usage_flags_t isl_extra_usage,
+ VkImageAspectFlagBits aspect,
+ VkImageCompressionFlagsEXT comp_flags);
+
void
-anv_image_copy_to_shadow(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- uint32_t base_level, uint32_t level_count,
- uint32_t base_layer, uint32_t layer_count);
+anv_cmd_buffer_fill_area(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address address,
+ VkDeviceSize size,
+ uint32_t data);
+
+VkResult
+anv_cmd_buffer_ensure_rcs_companion(struct anv_cmd_buffer *cmd_buffer);
+
+bool
+anv_can_hiz_clear_ds_view(struct anv_device *device,
+ const struct anv_image_view *iview,
+ VkImageLayout layout,
+ VkImageAspectFlags clear_aspects,
+ float depth_clear_value,
+ VkRect2D render_area,
+ const VkQueueFlagBits queue_flags);
+
+bool
+anv_can_fast_clear_color_view(struct anv_device *device,
+ struct anv_image_view *iview,
+ VkImageLayout layout,
+ union isl_color_value clear_color,
+ uint32_t num_layers,
+ VkRect2D render_area,
+ const VkQueueFlagBits queue_flags);
enum isl_aux_state ATTRIBUTE_PURE
anv_layout_to_aux_state(const struct intel_device_info * const devinfo,
const struct anv_image *image,
const VkImageAspectFlagBits aspect,
- const VkImageLayout layout);
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags);
enum isl_aux_usage ATTRIBUTE_PURE
anv_layout_to_aux_usage(const struct intel_device_info * const devinfo,
const struct anv_image *image,
const VkImageAspectFlagBits aspect,
const VkImageUsageFlagBits usage,
- const VkImageLayout layout);
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags);
enum anv_fast_clear_type ATTRIBUTE_PURE
anv_layout_to_fast_clear_type(const struct intel_device_info * const devinfo,
const struct anv_image * const image,
const VkImageAspectFlagBits aspect,
- const VkImageLayout layout);
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags);
+
+bool ATTRIBUTE_PURE
+anv_layout_has_untracked_aux_writes(const struct intel_device_info * const devinfo,
+ const struct anv_image * const image,
+ const VkImageAspectFlagBits aspect,
+ const VkImageLayout layout,
+ const VkQueueFlagBits queue_flags);
static inline bool
anv_image_aspects_compatible(VkImageAspectFlags aspects1,
@@ -4347,39 +5656,44 @@ struct anv_image_view {
const struct anv_image *image; /**< VkImageViewCreateInfo::image */
unsigned n_planes;
- struct {
- uint32_t image_plane;
+ /**
+ * True if the surface states (if any) are owned by some anv_state_stream
+ * from internal_surface_state_pool.
+ */
+ bool use_surface_state_stream;
+
+ struct {
struct isl_view isl;
/**
+ * A version of the image view for storage usage (can apply 3D image
+ * slicing).
+ */
+ struct isl_view isl_storage;
+
+ /**
* RENDER_SURFACE_STATE when using image as a sampler surface with an
* image layout of SHADER_READ_ONLY_OPTIMAL or
* DEPTH_STENCIL_READ_ONLY_OPTIMAL.
*/
- struct anv_surface_state optimal_sampler_surface_state;
+ struct anv_surface_state optimal_sampler;
/**
* RENDER_SURFACE_STATE when using image as a sampler surface with an
* image layout of GENERAL.
*/
- struct anv_surface_state general_sampler_surface_state;
+ struct anv_surface_state general_sampler;
/**
- * RENDER_SURFACE_STATE when using image as a storage image. Separate
- * states for write-only and readable, using the real format for
- * write-only and the lowered format for readable.
+ * RENDER_SURFACE_STATE when using image as a storage image.
*/
- struct anv_surface_state storage_surface_state;
- struct anv_surface_state writeonly_storage_surface_state;
-
- struct brw_image_param storage_image_param;
+ struct anv_surface_state storage;
} planes[3];
};
enum anv_image_view_state_flags {
- ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY = (1 << 0),
- ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL = (1 << 1),
+ ANV_IMAGE_VIEW_STATE_TEXTURE_OPTIMAL = (1 << 0),
};
void anv_image_fill_surface_state(struct anv_device *device,
@@ -4390,8 +5704,41 @@ void anv_image_fill_surface_state(struct anv_device *device,
enum isl_aux_usage aux_usage,
const union isl_color_value *clear_color,
enum anv_image_view_state_flags flags,
- struct anv_surface_state *state_inout,
- struct brw_image_param *image_param_out);
+ struct anv_surface_state *state_inout);
+
+
+static inline const struct anv_surface_state *
+anv_image_view_texture_surface_state(const struct anv_image_view *iview,
+ uint32_t plane, VkImageLayout layout)
+{
+ return layout == VK_IMAGE_LAYOUT_GENERAL ?
+ &iview->planes[plane].general_sampler :
+ &iview->planes[plane].optimal_sampler;
+}
+
+static inline const struct anv_surface_state *
+anv_image_view_storage_surface_state(const struct anv_image_view *iview)
+{
+ return &iview->planes[0].storage;
+}
+
+static inline bool
+anv_cmd_graphics_state_has_image_as_attachment(const struct anv_cmd_graphics_state *state,
+ const struct anv_image *image)
+{
+ for (unsigned a = 0; a < state->color_att_count; a++) {
+ if (state->color_att[a].iview &&
+ state->color_att[a].iview->image == image)
+ return true;
+ }
+
+ if (state->depth_att.iview && state->depth_att.iview->image == image)
+ return true;
+ if (state->stencil_att.iview && state->stencil_att.iview->image == image)
+ return true;
+
+ return false;
+}
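/* A minimal sketch (editor's illustration, assuming the cmd_buffer->state.gfx
 * field and a GENERAL-layout fallback) of how the helpers above could be
 * combined when choosing a sampler surface state for an image that may also
 * be bound as an attachment; this is not the driver's actual feedback-loop
 * handling.
 */
static inline const struct anv_surface_state *
example_pick_sampler_surface_state(const struct anv_cmd_buffer *cmd_buffer,
                                   const struct anv_image_view *iview,
                                   uint32_t plane, VkImageLayout layout)
{
   /* If the image is simultaneously used as a color or depth/stencil
    * attachment, fall back to the GENERAL-layout surface state in this
    * sketch.
    */
   if (anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
                                                      iview->image))
      layout = VK_IMAGE_LAYOUT_GENERAL;

   return anv_image_view_texture_surface_state(iview, plane, layout);
}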
struct anv_image_create_info {
const VkImageCreateInfo *vk_info;
@@ -4401,112 +5748,104 @@ struct anv_image_create_info {
/** These flags will be added to any derived from VkImageCreateInfo. */
isl_surf_usage_flags_t isl_extra_usage_flags;
+
+ /** An opt-in stride in pixels; should be 0 for implicit layouts */
+ uint32_t stride;
+
+ /** Whether to skip allocating the private binding */
+ bool no_private_binding_alloc;
};
-VkResult anv_image_create(VkDevice _device,
- const struct anv_image_create_info *info,
- const VkAllocationCallbacks* alloc,
- VkImage *pImage);
+VkResult anv_image_init(struct anv_device *device, struct anv_image *image,
+ const struct anv_image_create_info *create_info);
+
+void anv_image_finish(struct anv_image *image);
+
+void anv_image_get_memory_requirements(struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags aspects,
+ VkMemoryRequirements2 *pMemoryRequirements);
+
+void anv_image_view_init(struct anv_device *device,
+ struct anv_image_view *iview,
+ const VkImageViewCreateInfo *pCreateInfo,
+ struct anv_state_stream *state_stream);
+
+void anv_image_view_finish(struct anv_image_view *iview);
enum isl_format
anv_isl_format_for_descriptor_type(const struct anv_device *device,
VkDescriptorType type);
-static inline VkExtent3D
-anv_sanitize_image_extent(const VkImageType imageType,
- const VkExtent3D imageExtent)
-{
- switch (imageType) {
- case VK_IMAGE_TYPE_1D:
- return (VkExtent3D) { imageExtent.width, 1, 1 };
- case VK_IMAGE_TYPE_2D:
- return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 };
- case VK_IMAGE_TYPE_3D:
- return imageExtent;
- default:
- unreachable("invalid image type");
- }
-}
-
-static inline VkOffset3D
-anv_sanitize_image_offset(const VkImageType imageType,
- const VkOffset3D imageOffset)
+static inline isl_surf_usage_flags_t
+anv_isl_usage_for_descriptor_type(const VkDescriptorType type)
{
- switch (imageType) {
- case VK_IMAGE_TYPE_1D:
- return (VkOffset3D) { imageOffset.x, 0, 0 };
- case VK_IMAGE_TYPE_2D:
- return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 };
- case VK_IMAGE_TYPE_3D:
- return imageOffset;
- default:
- unreachable("invalid image type");
+ switch (type) {
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ return ISL_SURF_USAGE_CONSTANT_BUFFER_BIT;
+ default:
+ return ISL_SURF_USAGE_STORAGE_BIT;
}
}
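/* Worked example for anv_isl_usage_for_descriptor_type() above:
 * VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER and its dynamic variant map to
 * ISL_SURF_USAGE_CONSTANT_BUFFER_BIT, while every other descriptor type
 * (e.g. VK_DESCRIPTOR_TYPE_STORAGE_BUFFER) maps to ISL_SURF_USAGE_STORAGE_BIT.
 */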
static inline uint32_t
anv_rasterization_aa_mode(VkPolygonMode raster_mode,
- VkLineRasterizationModeEXT line_mode)
+ VkLineRasterizationModeKHR line_mode)
{
if (raster_mode == VK_POLYGON_MODE_LINE &&
- line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT)
+ line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR)
return true;
return false;
}
-VkFormatFeatureFlags
-anv_get_image_format_features(const struct intel_device_info *devinfo,
- VkFormat vk_format,
- const struct anv_format *anv_format,
- VkImageTiling vk_tiling,
- const struct isl_drm_modifier_info *isl_mod_info);
+static inline VkLineRasterizationModeKHR
+anv_line_rasterization_mode(VkLineRasterizationModeKHR line_mode,
+ unsigned rasterization_samples)
+{
+ if (line_mode == VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR) {
+ if (rasterization_samples > 1) {
+ return VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR;
+ } else {
+ return VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
+ }
+ }
+ return line_mode;
+}
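/* Worked example for anv_line_rasterization_mode() above (values purely for
 * illustration): the Vulkan "default" line mode is resolved from the sample
 * count, while any explicit mode is returned unchanged.
 *
 *    anv_line_rasterization_mode(VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR, 4)
 *       == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR
 *    anv_line_rasterization_mode(VK_LINE_RASTERIZATION_MODE_DEFAULT_KHR, 1)
 *       == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR
 */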
+
+static inline bool
+anv_is_dual_src_blend_factor(VkBlendFactor factor)
+{
+ return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
+ factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
+ factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
+ factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
+}
+
+static inline bool
+anv_is_dual_src_blend_equation(const struct vk_color_blend_attachment_state *cb)
+{
+ return anv_is_dual_src_blend_factor(cb->src_color_blend_factor) &&
+ anv_is_dual_src_blend_factor(cb->dst_color_blend_factor) &&
+ anv_is_dual_src_blend_factor(cb->src_alpha_blend_factor) &&
+ anv_is_dual_src_blend_factor(cb->dst_alpha_blend_factor);
+}
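/* A small sketch of how the dual-source helpers above might be consulted
 * when programming blend state; the blend_enable field of
 * vk_color_blend_attachment_state is an assumption for illustration.
 */
static inline bool
example_attachment_needs_dual_src(const struct vk_color_blend_attachment_state *cb)
{
   /* Dual-source blending consumes the second fragment color output, so it
    * only matters when blending is actually enabled. */
   return cb->blend_enable && anv_is_dual_src_blend_equation(cb);
}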
+
+VkFormatFeatureFlags2
+anv_get_image_format_features2(const struct anv_physical_device *physical_device,
+ VkFormat vk_format,
+ const struct anv_format *anv_format,
+ VkImageTiling vk_tiling,
+ const struct isl_drm_modifier_info *isl_mod_info);
void anv_fill_buffer_surface_state(struct anv_device *device,
- struct anv_state state,
+ void *surface_state_ptr,
enum isl_format format,
+ struct isl_swizzle swizzle,
isl_surf_usage_flags_t usage,
struct anv_address address,
uint32_t range, uint32_t stride);
-static inline void
-anv_clear_color_from_att_state(union isl_color_value *clear_color,
- const struct anv_attachment_state *att_state,
- const struct anv_image_view *iview)
-{
- const struct isl_format_layout *view_fmtl =
- isl_format_get_layout(iview->planes[0].isl.format);
-
-#define COPY_CLEAR_COLOR_CHANNEL(c, i) \
- if (view_fmtl->channels.c.bits) \
- clear_color->u32[i] = att_state->clear_value.color.uint32[i]
-
- COPY_CLEAR_COLOR_CHANNEL(r, 0);
- COPY_CLEAR_COLOR_CHANNEL(g, 1);
- COPY_CLEAR_COLOR_CHANNEL(b, 2);
- COPY_CLEAR_COLOR_CHANNEL(a, 3);
-
-#undef COPY_CLEAR_COLOR_CHANNEL
-}
-
-
-/* Haswell border color is a bit of a disaster. Float and unorm formats use a
- * straightforward 32-bit float color in the first 64 bytes. Instead of using
- * a nice float/integer union like Gfx8+, Haswell specifies the integer border
- * color as a separate entry /after/ the float color. The layout of this entry
- * also depends on the format's bpp (with extra hacks for RG32), and overlaps.
- *
- * Since we don't know the format/bpp, we can't make any of the border colors
- * containing '1' work for all formats, as it would be in the wrong place for
- * some of them. We opt to make 32-bit integers work as this seems like the
- * most common option. Fortunately, transparent black works regardless, as
- * all zeroes is the same in every bit-size.
- */
-struct hsw_border_color {
- float float32[4];
- uint32_t _pad0[12];
- uint32_t uint32[4];
- uint32_t _pad1[108];
-};
struct gfx8_border_color {
union {
@@ -4517,24 +5856,19 @@ struct gfx8_border_color {
uint32_t _pad[12];
};
-struct anv_ycbcr_conversion {
- struct vk_object_base base;
-
- const struct anv_format * format;
- VkSamplerYcbcrModelConversion ycbcr_model;
- VkSamplerYcbcrRange ycbcr_range;
- VkComponentSwizzle mapping[4];
- VkChromaLocation chroma_offsets[2];
- VkFilter chroma_filter;
- bool chroma_reconstruction;
-};
-
struct anv_sampler {
- struct vk_object_base base;
+ struct vk_sampler vk;
+
+ /* Hash of the sampler state + border color, useful for embedded samplers
+ * and included in the descriptor layout hash.
+ */
+ unsigned char sha1[20];
uint32_t state[3][4];
+ uint32_t db_state[3][4];
+ /* Packed SAMPLER_STATE without the border color pointer. */
+ uint32_t state_no_bc[3][4];
uint32_t n_planes;
- struct anv_ycbcr_conversion *conversion;
/* Blob of sampler state data which is guaranteed to be 32-byte aligned
* and with a 32-byte stride for use as bindless samplers.
@@ -4542,107 +5876,27 @@ struct anv_sampler {
struct anv_state bindless_state;
struct anv_state custom_border_color;
-};
-
-struct anv_framebuffer {
- struct vk_object_base base;
-
- uint32_t width;
- uint32_t height;
- uint32_t layers;
-
- uint32_t attachment_count;
- struct anv_image_view * attachments[0];
-};
-
-struct anv_subpass_attachment {
- VkImageUsageFlagBits usage;
- uint32_t attachment;
- VkImageLayout layout;
-
- /* Used only with attachment containing stencil data. */
- VkImageLayout stencil_layout;
-};
-
-struct anv_subpass {
- uint32_t attachment_count;
-
- /**
- * A pointer to all attachment references used in this subpass.
- * Only valid if ::attachment_count > 0.
- */
- struct anv_subpass_attachment * attachments;
- uint32_t input_count;
- struct anv_subpass_attachment * input_attachments;
- uint32_t color_count;
- struct anv_subpass_attachment * color_attachments;
- struct anv_subpass_attachment * resolve_attachments;
-
- struct anv_subpass_attachment * depth_stencil_attachment;
- struct anv_subpass_attachment * ds_resolve_attachment;
- VkResolveModeFlagBitsKHR depth_resolve_mode;
- VkResolveModeFlagBitsKHR stencil_resolve_mode;
-
- uint32_t view_mask;
-
- /** Subpass has a depth/stencil self-dependency */
- bool has_ds_self_dep;
-
- /** Subpass has at least one color resolve attachment */
- bool has_color_resolve;
-};
-
-static inline unsigned
-anv_subpass_view_count(const struct anv_subpass *subpass)
-{
- return MAX2(1, util_bitcount(subpass->view_mask));
-}
-
-struct anv_render_pass_attachment {
- /* TODO: Consider using VkAttachmentDescription instead of storing each of
- * its members individually.
- */
- VkFormat format;
- uint32_t samples;
- VkImageUsageFlags usage;
- VkAttachmentLoadOp load_op;
- VkAttachmentStoreOp store_op;
- VkAttachmentLoadOp stencil_load_op;
- VkImageLayout initial_layout;
- VkImageLayout final_layout;
- VkImageLayout first_subpass_layout;
-
- VkImageLayout stencil_initial_layout;
- VkImageLayout stencil_final_layout;
-
- /* The subpass id in which the attachment will be used last. */
- uint32_t last_subpass_idx;
-};
-
-struct anv_render_pass {
- struct vk_object_base base;
-
- uint32_t attachment_count;
- uint32_t subpass_count;
- /* An array of subpass_count+1 flushes, one per subpass boundary */
- enum anv_pipe_bits * subpass_flushes;
- struct anv_render_pass_attachment * attachments;
- struct anv_subpass subpasses[0];
+ struct anv_state custom_border_color_db;
};
#define ANV_PIPELINE_STATISTICS_MASK 0x000007ff
struct anv_query_pool {
- struct vk_object_base base;
+ struct vk_query_pool vk;
- VkQueryType type;
- VkQueryPipelineStatisticFlags pipeline_statistics;
/** Stride between slots, in bytes */
uint32_t stride;
/** Number of slots in this query pool */
- uint32_t slots;
struct anv_bo * bo;
+ /** Offset of the KHR_performance_query small batches that update
+ * ANV_PERF_QUERY_OFFSET_REG
+ */
+ uint32_t khr_perf_preambles_offset;
+
+ /** Size of each small batch */
+ uint32_t khr_perf_preamble_stride;
+
/* KHR perf queries : */
uint32_t pass_size;
uint32_t data_offset;
@@ -4656,40 +5910,59 @@ struct anv_query_pool {
static inline uint32_t khr_perf_query_preamble_offset(const struct anv_query_pool *pool,
uint32_t pass)
{
- return pool->pass_size * pass + 8;
+ return pool->khr_perf_preambles_offset +
+ pool->khr_perf_preamble_stride * pass;
}
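/* Worked example for khr_perf_query_preamble_offset() above (numbers purely
 * illustrative): with khr_perf_preambles_offset = 4096 and
 * khr_perf_preamble_stride = 64, the preamble batch for pass 3 starts at
 * 4096 + 64 * 3 = 4288 bytes into the query pool BO.
 */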
-struct anv_acceleration_structure {
- struct vk_object_base base;
+struct anv_vid_mem {
+ struct anv_device_memory *mem;
+ VkDeviceSize offset;
+ VkDeviceSize size;
+};
- VkDeviceSize size;
- struct anv_address address;
+#define ANV_VIDEO_MEM_REQS_H264 4
+#define ANV_VIDEO_MEM_REQS_H265 9
+#define ANV_MB_WIDTH 16
+#define ANV_MB_HEIGHT 16
+#define ANV_VIDEO_H264_MAX_NUM_REF_FRAME 16
+#define ANV_VIDEO_H265_MAX_NUM_REF_FRAME 16
+#define ANV_VIDEO_H265_HCP_NUM_REF_FRAME 8
+#define ANV_MAX_H265_CTB_SIZE 64
+
+enum anv_vid_mem_h264_types {
+ ANV_VID_MEM_H264_INTRA_ROW_STORE,
+ ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE,
+ ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH,
+ ANV_VID_MEM_H264_MPR_ROW_SCRATCH,
+ ANV_VID_MEM_H264_MAX,
};
-int anv_get_instance_entrypoint_index(const char *name);
-int anv_get_device_entrypoint_index(const char *name);
-int anv_get_physical_device_entrypoint_index(const char *name);
+enum anv_vid_mem_h265_types {
+ ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE,
+ ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_LINE,
+ ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_COLUMN,
+ ANV_VID_MEM_H265_METADATA_LINE,
+ ANV_VID_MEM_H265_METADATA_TILE_LINE,
+ ANV_VID_MEM_H265_METADATA_TILE_COLUMN,
+ ANV_VID_MEM_H265_SAO_LINE,
+ ANV_VID_MEM_H265_SAO_TILE_LINE,
+ ANV_VID_MEM_H265_SAO_TILE_COLUMN,
+ ANV_VID_MEM_H265_MAX,
+};
-const char *anv_get_instance_entry_name(int index);
-const char *anv_get_physical_device_entry_name(int index);
-const char *anv_get_device_entry_name(int index);
+struct anv_video_session {
+ struct vk_video_session vk;
-bool
-anv_instance_entrypoint_is_enabled(int index, uint32_t core_version,
- const struct vk_instance_extension_table *instance);
-bool
-anv_physical_device_entrypoint_is_enabled(int index, uint32_t core_version,
- const struct vk_instance_extension_table *instance);
-bool
-anv_device_entrypoint_is_enabled(int index, uint32_t core_version,
- const struct vk_instance_extension_table *instance,
- const struct vk_device_extension_table *device);
+ /* the decoder needs some private memory allocations */
+ struct anv_vid_mem vid_mem[ANV_VID_MEM_H265_MAX];
+};
-const struct vk_device_dispatch_table *
-anv_get_device_dispatch_table(const struct intel_device_info *devinfo);
+struct anv_video_session_params {
+ struct vk_video_session_parameters vk;
+};
void
-anv_dump_pipe_bits(enum anv_pipe_bits bits);
+anv_dump_pipe_bits(enum anv_pipe_bits bits, FILE *f);
static inline void
anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer,
@@ -4697,27 +5970,17 @@ anv_add_pending_pipe_bits(struct anv_cmd_buffer* cmd_buffer,
const char* reason)
{
cmd_buffer->state.pending_pipe_bits |= bits;
- if (unlikely(INTEL_DEBUG & DEBUG_PIPE_CONTROL) && bits)
- {
- fputs("pc: add ", stderr);
- anv_dump_pipe_bits(bits);
- fprintf(stderr, "reason: %s\n", reason);
+ if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
+ fputs("pc: add ", stdout);
+ anv_dump_pipe_bits(bits, stdout);
+ fprintf(stdout, "reason: %s\n", reason);
+ }
+ /* Store the reason, if space is available */
+ if (cmd_buffer->state.pc_reasons_count <
+ ARRAY_SIZE(cmd_buffer->state.pc_reasons)) {
+ cmd_buffer->state.pc_reasons[
+ cmd_buffer->state.pc_reasons_count++] = reason;
}
-}
-
-static inline uint32_t
-anv_get_subpass_id(const struct anv_cmd_state * const cmd_state)
-{
- /* This function must be called from within a subpass. */
- assert(cmd_state->pass && cmd_state->subpass);
-
- const uint32_t subpass_id = cmd_state->subpass - cmd_state->pass->subpasses;
-
- /* The id of this subpass shouldn't exceed the number of subpasses in this
- * render pass minus 1.
- */
- assert(subpass_id < cmd_state->pass->subpass_count);
- return subpass_id;
}
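/* A usage sketch for anv_add_pending_pipe_bits() above; the specific bit and
 * reason string are assumptions for illustration only:
 *
 *    anv_add_pending_pipe_bits(cmd_buffer,
 *                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
 *                              "end of rendering");
 *
 * The bits only accumulate in cmd_buffer->state.pending_pipe_bits here; an
 * actual PIPE_CONTROL is emitted later when the pending bits are flushed.
 */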
struct anv_performance_configuration_intel {
@@ -4728,6 +5991,7 @@ struct anv_performance_configuration_intel {
uint64_t config_id;
};
+void anv_physical_device_init_va_ranges(struct anv_physical_device *device);
void anv_physical_device_init_perf(struct anv_physical_device *device, int fd);
void anv_device_perf_init(struct anv_device *device);
void anv_perf_write_pass_results(struct intel_perf_config *perf,
@@ -4735,25 +5999,108 @@ void anv_perf_write_pass_results(struct intel_perf_config *perf,
const struct intel_perf_query_result *accumulated_results,
union VkPerformanceCounterResultKHR *results);
+void anv_apply_per_prim_attr_wa(struct nir_shader *ms_nir,
+ struct nir_shader *fs_nir,
+ struct anv_device *device,
+ const VkGraphicsPipelineCreateInfo *info);
+
+/* Used to emit a series of memcpy operations */
+struct anv_memcpy_state {
+ struct anv_device *device;
+ struct anv_batch *batch;
+
+ struct anv_vb_cache_range vb_bound;
+ struct anv_vb_cache_range vb_dirty;
+};
+
+VkResult anv_device_init_internal_kernels(struct anv_device *device);
+void anv_device_finish_internal_kernels(struct anv_device *device);
+VkResult anv_device_get_internal_shader(struct anv_device *device,
+ enum anv_internal_kernel_name name,
+ struct anv_shader_bin **out_bin);
+
+VkResult anv_device_init_astc_emu(struct anv_device *device);
+void anv_device_finish_astc_emu(struct anv_device *device);
+void anv_astc_emu_process(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_image *image,
+ VkImageLayout layout,
+ const VkImageSubresourceLayers *subresource,
+ VkOffset3D block_offset,
+ VkExtent3D block_extent);
+
+/* This structure is used in 2 scenarios:
+ *
+ * - copy utrace timestamps from a command buffer so that the command buffer
+ * can be resubmitted multiple times without the recorded timestamps being
+ * overwritten before they're read back
+ *
+ * - emit trace points for queue debug tagging
+ * (vkQueueBeginDebugUtilsLabelEXT/vkQueueEndDebugUtilsLabelEXT)
+ */
+struct anv_utrace_submit {
+ /* Needs to be the first field */
+ struct intel_ds_flush_data ds;
+
+ /* Batch data used to implement a copy of the timestamps recorded in another
+ * buffer.
+ */
+ struct anv_reloc_list relocs;
+ struct anv_batch batch;
+ struct util_dynarray batch_bos;
+
+ /* Stream for temporary allocations */
+ struct anv_state_stream dynamic_state_stream;
+ struct anv_state_stream general_state_stream;
+
+ /* Syncobj to be signaled when the batch completes */
+ struct vk_sync *sync;
+
+ /* Queue on which all the recorded traces are submitted */
+ struct anv_queue *queue;
+
+ /* Buffer of 64-bit timestamps (only used for timestamp copies) */
+ struct anv_bo *trace_bo;
+
+ /* Last fully read 64bit timestamp (used to rebuild the upper bits of 32bit
+ * timestamps)
+ */
+ uint64_t last_full_timestamp;
+
+ /* Memcpy state tracking (only used for timestamp copies on render engine) */
+ struct anv_memcpy_state memcpy_state;
+
+ /* Memcpy state tracking (only used for timestamp copies on compute engine) */
+ struct anv_simple_shader simple_state;
+};
+
+void anv_device_utrace_init(struct anv_device *device);
+void anv_device_utrace_finish(struct anv_device *device);
+VkResult
+anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ struct anv_utrace_submit **out_submit);
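/* A submission-time sketch (editor's assumption about the intended use of
 * anv_device_utrace_flush_cmd_buffers(), not a copy of the driver's submit
 * path; error handling elided):
 *
 *    struct anv_utrace_submit *utrace_submit = NULL;
 *    VkResult result =
 *       anv_device_utrace_flush_cmd_buffers(queue, cmd_buffer_count,
 *                                           cmd_buffers, &utrace_submit);
 *    if (result == VK_SUCCESS && utrace_submit != NULL) {
 *       // Execute the copy batch after the real workload so the recorded
 *       // timestamps survive a resubmission of the command buffers.
 *    }
 */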
+
+static bool
+anv_has_cooperative_matrix(const struct anv_physical_device *device)
+{
+ return device->has_cooperative_matrix;
+}
+
#define ANV_FROM_HANDLE(__anv_type, __name, __handle) \
VK_FROM_HANDLE(__anv_type, __name, __handle)
-VK_DEFINE_HANDLE_CASTS(anv_cmd_buffer, base, VkCommandBuffer,
+VK_DEFINE_HANDLE_CASTS(anv_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_HANDLE_CASTS(anv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(anv_instance, vk.base, VkInstance, VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(anv_physical_device, vk.base, VkPhysicalDevice,
VK_OBJECT_TYPE_PHYSICAL_DEVICE)
-VK_DEFINE_HANDLE_CASTS(anv_queue, base, VkQueue, VK_OBJECT_TYPE_QUEUE)
-
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_acceleration_structure, base,
- VkAccelerationStructureKHR,
- VK_OBJECT_TYPE_ACCELERATION_STRUCTURE_KHR)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_cmd_pool, base, VkCommandPool,
- VK_OBJECT_TYPE_COMMAND_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer, base, VkBuffer,
+VK_DEFINE_HANDLE_CASTS(anv_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
+
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer, vk.base, VkBuffer,
VK_OBJECT_TYPE_BUFFER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer_view, base, VkBufferView,
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_buffer_view, vk.base, VkBufferView,
VK_OBJECT_TYPE_BUFFER_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_pool, base, VkDescriptorPool,
VK_OBJECT_TYPE_DESCRIPTOR_POOL)
@@ -4762,51 +6109,33 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set, base, VkDescriptorSet,
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_set_layout, base,
VkDescriptorSetLayout,
VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_descriptor_update_template, base,
- VkDescriptorUpdateTemplate,
- VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_device_memory, base, VkDeviceMemory,
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_device_memory, vk.base, VkDeviceMemory,
VK_OBJECT_TYPE_DEVICE_MEMORY)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_fence, base, VkFence, VK_OBJECT_TYPE_FENCE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_framebuffer, base, VkFramebuffer,
- VK_OBJECT_TYPE_FRAMEBUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_image_view, vk.base, VkImageView,
VK_OBJECT_TYPE_IMAGE_VIEW);
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_cache, base, VkPipelineCache,
- VK_OBJECT_TYPE_PIPELINE_CACHE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline, base, VkPipeline,
VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_pipeline_layout, base, VkPipelineLayout,
VK_OBJECT_TYPE_PIPELINE_LAYOUT)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, base, VkQueryPool,
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_query_pool, vk.base, VkQueryPool,
VK_OBJECT_TYPE_QUERY_POOL)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_render_pass, base, VkRenderPass,
- VK_OBJECT_TYPE_RENDER_PASS)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, base, VkSampler,
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_sampler, vk.base, VkSampler,
VK_OBJECT_TYPE_SAMPLER)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_semaphore, base, VkSemaphore,
- VK_OBJECT_TYPE_SEMAPHORE)
-VK_DEFINE_NONDISP_HANDLE_CASTS(anv_ycbcr_conversion, base,
- VkSamplerYcbcrConversion,
- VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)
VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
VkPerformanceConfigurationINTEL,
VK_OBJECT_TYPE_PERFORMANCE_CONFIGURATION_INTEL)
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_video_session, vk.base,
+ VkVideoSessionKHR,
+ VK_OBJECT_TYPE_VIDEO_SESSION_KHR)
+VK_DEFINE_NONDISP_HANDLE_CASTS(anv_video_session_params, vk.base,
+ VkVideoSessionParametersKHR,
+ VK_OBJECT_TYPE_VIDEO_SESSION_PARAMETERS_KHR)
#define anv_genX(devinfo, thing) ({ \
__typeof(&gfx9_##thing) genX_thing; \
switch ((devinfo)->verx10) { \
- case 70: \
- genX_thing = &gfx7_##thing; \
- break; \
- case 75: \
- genX_thing = &gfx75_##thing; \
- break; \
- case 80: \
- genX_thing = &gfx8_##thing; \
- break; \
case 90: \
genX_thing = &gfx9_##thing; \
break; \
@@ -4819,6 +6148,9 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
case 125: \
genX_thing = &gfx125_##thing; \
break; \
+ case 200: \
+ genX_thing = &gfx20_##thing; \
+ break; \
default: \
unreachable("Unknown hardware generation"); \
} \
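/* A dispatch sketch for the anv_genX() macro above; init_device_state is a
 * plausible genX entry point used purely for illustration:
 *
 *    anv_genX(device->info, init_device_state)(device);
 *
 * resolves to gfx9_init_device_state(), gfx125_init_device_state(),
 * gfx20_init_device_state(), etc., based on devinfo->verx10.
 */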
@@ -4829,15 +6161,6 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
#ifdef genX
# include "anv_genX.h"
#else
-# define genX(x) gfx7_##x
-# include "anv_genX.h"
-# undef genX
-# define genX(x) gfx75_##x
-# include "anv_genX.h"
-# undef genX
-# define genX(x) gfx8_##x
-# include "anv_genX.h"
-# undef genX
# define genX(x) gfx9_##x
# include "anv_genX.h"
# undef genX
@@ -4850,6 +6173,13 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(anv_performance_configuration_intel, base,
# define genX(x) gfx125_##x
# include "anv_genX.h"
# undef genX
+# define genX(x) gfx20_##x
+# include "anv_genX.h"
+# undef genX
+#endif
+
+#ifdef __cplusplus
+}
#endif
#endif /* ANV_PRIVATE_H */
diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c
index f94223b1a30..1989016f6b2 100644
--- a/src/intel/vulkan/anv_queue.c
+++ b/src/intel/vulkan/anv_queue.c
@@ -22,2668 +22,106 @@
*/
/**
- * This file implements VkQueue, VkFence, and VkSemaphore
+ * This file implements VkQueue
*/
-#include <errno.h>
-#include <fcntl.h>
-#include <unistd.h>
-
-#include "util/os_file.h"
-
#include "anv_private.h"
-#include "anv_measure.h"
-#include "vk_util.h"
-
-#include "genxml/gen7_pack.h"
-
-uint64_t anv_gettime_ns(void)
-{
- struct timespec current;
- clock_gettime(CLOCK_MONOTONIC, &current);
- return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
-}
-
-uint64_t anv_get_absolute_timeout(uint64_t timeout)
-{
- if (timeout == 0)
- return 0;
- uint64_t current_time = anv_gettime_ns();
- uint64_t max_timeout = (uint64_t) INT64_MAX - current_time;
-
- timeout = MIN2(max_timeout, timeout);
-
- return (current_time + timeout);
-}
-
-static int64_t anv_get_relative_timeout(uint64_t abs_timeout)
-{
- uint64_t now = anv_gettime_ns();
-
- /* We don't want negative timeouts.
- *
- * DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is
- * supposed to block indefinitely for timeouts < 0. Unfortunately,
- * this was broken for a couple of kernel releases. Since there's
- * no way to know whether or not the kernel we're using is one of
- * the broken ones, the best we can do is to clamp the timeout to
- * INT64_MAX. This limits the maximum timeout from 584 years to
- * 292 years - likely not a big deal.
- */
- if (abs_timeout < now)
- return 0;
-
- uint64_t rel_timeout = abs_timeout - now;
- if (rel_timeout > (uint64_t) INT64_MAX)
- rel_timeout = INT64_MAX;
-
- return rel_timeout;
-}
-
-static void anv_semaphore_impl_cleanup(struct anv_device *device,
- struct anv_semaphore_impl *impl);
-
-static void
-anv_queue_submit_free(struct anv_device *device,
- struct anv_queue_submit *submit)
-{
- const VkAllocationCallbacks *alloc = submit->alloc;
-
- for (uint32_t i = 0; i < submit->temporary_semaphore_count; i++)
- anv_semaphore_impl_cleanup(device, &submit->temporary_semaphores[i]);
- /* Execbuf does not consume the in_fence. It's our job to close it. */
- if (submit->in_fence != -1) {
- assert(!device->has_thread_submit);
- close(submit->in_fence);
- }
- if (submit->out_fence != -1) {
- assert(!device->has_thread_submit);
- close(submit->out_fence);
- }
- vk_free(alloc, submit->fences);
- vk_free(alloc, submit->fence_values);
- vk_free(alloc, submit->temporary_semaphores);
- vk_free(alloc, submit->wait_timelines);
- vk_free(alloc, submit->wait_timeline_values);
- vk_free(alloc, submit->signal_timelines);
- vk_free(alloc, submit->signal_timeline_values);
- vk_free(alloc, submit->fence_bos);
- vk_free(alloc, submit->cmd_buffers);
- vk_free(alloc, submit);
-}
-
-static bool
-anv_queue_submit_ready_locked(struct anv_queue_submit *submit)
-{
- for (uint32_t i = 0; i < submit->wait_timeline_count; i++) {
- if (submit->wait_timeline_values[i] > submit->wait_timelines[i]->highest_pending)
- return false;
- }
-
- return true;
-}
-
-static VkResult
-anv_timeline_init(struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t initial_value)
-{
- timeline->highest_past =
- timeline->highest_pending = initial_value;
- list_inithead(&timeline->points);
- list_inithead(&timeline->free_points);
-
- return VK_SUCCESS;
-}
-static void
-anv_timeline_finish(struct anv_device *device,
- struct anv_timeline *timeline)
-{
- list_for_each_entry_safe(struct anv_timeline_point, point,
- &timeline->free_points, link) {
- list_del(&point->link);
- anv_device_release_bo(device, point->bo);
- vk_free(&device->vk.alloc, point);
- }
- list_for_each_entry_safe(struct anv_timeline_point, point,
- &timeline->points, link) {
- list_del(&point->link);
- anv_device_release_bo(device, point->bo);
- vk_free(&device->vk.alloc, point);
- }
-}
+#include "i915/anv_queue.h"
+#include "xe/anv_queue.h"
static VkResult
-anv_timeline_add_point_locked(struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t value,
- struct anv_timeline_point **point)
+anv_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo)
{
- VkResult result = VK_SUCCESS;
-
- if (list_is_empty(&timeline->free_points)) {
- *point =
- vk_zalloc(&device->vk.alloc, sizeof(**point),
- 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
- if (!(*point))
- result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- if (result == VK_SUCCESS) {
- result = anv_device_alloc_bo(device, "timeline-semaphore", 4096,
- ANV_BO_ALLOC_EXTERNAL |
- ANV_BO_ALLOC_IMPLICIT_SYNC,
- 0 /* explicit_address */,
- &(*point)->bo);
- if (result != VK_SUCCESS)
- vk_free(&device->vk.alloc, *point);
- }
- } else {
- *point = list_first_entry(&timeline->free_points,
- struct anv_timeline_point, link);
- list_del(&(*point)->link);
- }
-
- if (result == VK_SUCCESS) {
- (*point)->serial = value;
- list_addtail(&(*point)->link, &timeline->points);
- }
-
- return result;
-}
-
-static VkResult
-anv_timeline_gc_locked(struct anv_device *device,
- struct anv_timeline *timeline)
-{
- list_for_each_entry_safe(struct anv_timeline_point, point,
- &timeline->points, link) {
- /* timeline->highest_pending is only incremented once submission has
- * happened. If this point has a greater serial, it means the point
- * hasn't been submitted yet.
- */
- if (point->serial > timeline->highest_pending)
- return VK_SUCCESS;
-
- /* If someone is waiting on this time point, consider it busy and don't
- * try to recycle it. There's a slim possibility that it's no longer
- * busy by the time we look at it but we would be recycling it out from
- * under a waiter and that can lead to weird races.
- *
- * We walk the list in-order so if this time point is still busy so is
- * every following time point
- */
- assert(point->waiting >= 0);
- if (point->waiting)
- return VK_SUCCESS;
-
- /* Garbage collect any signaled point. */
- VkResult result = anv_device_bo_busy(device, point->bo);
- if (result == VK_NOT_READY) {
- /* We walk the list in-order so if this time point is still busy so
- * is every following time point
- */
- return VK_SUCCESS;
- } else if (result != VK_SUCCESS) {
- return result;
- }
-
- assert(timeline->highest_past < point->serial);
- timeline->highest_past = point->serial;
-
- list_del(&point->link);
- list_add(&point->link, &timeline->free_points);
- }
-
- return VK_SUCCESS;
-}
-
-static VkResult anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit,
- struct anv_bo *bo,
- bool signal);
-
-static VkResult
-anv_queue_submit_timeline_locked(struct anv_queue *queue,
- struct anv_queue_submit *submit)
-{
- VkResult result;
-
- for (uint32_t i = 0; i < submit->wait_timeline_count; i++) {
- struct anv_timeline *timeline = submit->wait_timelines[i];
- uint64_t wait_value = submit->wait_timeline_values[i];
-
- if (timeline->highest_past >= wait_value)
- continue;
-
- list_for_each_entry(struct anv_timeline_point, point, &timeline->points, link) {
- if (point->serial < wait_value)
- continue;
- result = anv_queue_submit_add_fence_bo(submit, point->bo, false);
- if (result != VK_SUCCESS)
- return result;
- break;
- }
- }
- for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
- struct anv_timeline *timeline = submit->signal_timelines[i];
- uint64_t signal_value = submit->signal_timeline_values[i];
- struct anv_timeline_point *point;
-
- result = anv_timeline_add_point_locked(queue->device, timeline,
- signal_value, &point);
- if (result != VK_SUCCESS)
- return result;
-
- result = anv_queue_submit_add_fence_bo(submit, point->bo, true);
- if (result != VK_SUCCESS)
- return result;
- }
-
- result = anv_queue_execbuf_locked(queue, submit);
-
- if (result == VK_SUCCESS) {
- /* Update the pending values in the timeline objects. */
- for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
- struct anv_timeline *timeline = submit->signal_timelines[i];
- uint64_t signal_value = submit->signal_timeline_values[i];
-
- assert(signal_value > timeline->highest_pending);
- timeline->highest_pending = signal_value;
- }
- } else {
- /* Unblock any waiter by signaling the points, the application will get
- * a device lost error code.
- */
- for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
- struct anv_timeline *timeline = submit->signal_timelines[i];
- uint64_t signal_value = submit->signal_timeline_values[i];
-
- assert(signal_value > timeline->highest_pending);
- timeline->highest_past = timeline->highest_pending = signal_value;
- }
- }
-
- return result;
-}
-
-static VkResult
-anv_queue_submit_deferred_locked(struct anv_queue *queue, uint32_t *advance)
-{
- VkResult result = VK_SUCCESS;
-
- /* Go through all the queued submissions and submit them until we find one
- * that's waiting on a point that hasn't materialized yet.
- */
- list_for_each_entry_safe(struct anv_queue_submit, submit,
- &queue->queued_submits, link) {
- if (!anv_queue_submit_ready_locked(submit))
- break;
-
- (*advance)++;
- list_del(&submit->link);
-
- result = anv_queue_submit_timeline_locked(queue, submit);
-
- anv_queue_submit_free(queue->device, submit);
-
- if (result != VK_SUCCESS)
- break;
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ return anv_i915_create_engine(device, queue, pCreateInfo);
+ case INTEL_KMD_TYPE_XE:
+ return anv_xe_create_engine(device, queue, pCreateInfo);
+ default:
+ unreachable("Missing");
+ return VK_ERROR_UNKNOWN;
}
-
- return result;
-}
-
-static VkResult
-anv_device_submit_deferred_locked(struct anv_device *device)
-{
- VkResult result = VK_SUCCESS;
-
- uint32_t advance;
- do {
- advance = 0;
- for (uint32_t i = 0; i < device->queue_count; i++) {
- struct anv_queue *queue = &device->queues[i];
- VkResult qres = anv_queue_submit_deferred_locked(queue, &advance);
- if (qres != VK_SUCCESS)
- result = qres;
- }
- } while (advance);
-
- return result;
}
static void
-anv_queue_submit_signal_fences(struct anv_device *device,
- struct anv_queue_submit *submit)
-{
- for (uint32_t i = 0; i < submit->fence_count; i++) {
- if (submit->fences[i].flags & I915_EXEC_FENCE_SIGNAL) {
- anv_gem_syncobj_timeline_signal(device, &submit->fences[i].handle,
- &submit->fence_values[i], 1);
- }
- }
-}
-
-static void *
-anv_queue_task(void *_queue)
-{
- struct anv_queue *queue = _queue;
-
- pthread_mutex_lock(&queue->mutex);
-
- while (!queue->quit) {
- while (!list_is_empty(&queue->queued_submits)) {
- struct anv_queue_submit *submit =
- list_first_entry(&queue->queued_submits, struct anv_queue_submit, link);
- list_del(&submit->link);
-
- pthread_mutex_unlock(&queue->mutex);
-
- VkResult result = VK_ERROR_DEVICE_LOST;
-
- /* Wait for timeline points to materialize before submitting. We need
- * to do this because we're using threads to do the submit to i915.
- * We could end up in a situation where the application submits to 2
- * queues with the first submit creating the dma-fence for the
- * second. But because the scheduling of the submission threads might
- * wakeup the second queue thread first, this would make that execbuf
- * fail because the dma-fence it depends on hasn't materialized yet.
- */
- if (!queue->lost && submit->wait_timeline_count > 0) {
- int ret = queue->device->info.no_hw ? 0 :
- anv_gem_syncobj_timeline_wait(
- queue->device, submit->wait_timeline_syncobjs,
- submit->wait_timeline_values, submit->wait_timeline_count,
- anv_get_absolute_timeout(UINT64_MAX) /* wait forever */,
- true /* wait for all */, true /* wait for materialize */);
- if (ret) {
- result = anv_queue_set_lost(queue, "timeline timeout: %s",
- strerror(errno));
- }
- }
-
- /* Now submit */
- if (!queue->lost) {
- pthread_mutex_lock(&queue->device->mutex);
- result = anv_queue_execbuf_locked(queue, submit);
- pthread_mutex_unlock(&queue->device->mutex);
- }
-
- if (result != VK_SUCCESS) {
- /* vkQueueSubmit or some other entry point will report the
- * DEVICE_LOST error at some point, but until we have emptied our
- * list of execbufs we need to wake up all the potential waiters
- * until one of them spots the error.
- */
- anv_queue_submit_signal_fences(queue->device, submit);
- }
-
- anv_queue_submit_free(queue->device, submit);
-
- pthread_mutex_lock(&queue->mutex);
- }
-
- if (!queue->quit)
- pthread_cond_wait(&queue->cond, &queue->mutex);
- }
-
- pthread_mutex_unlock(&queue->mutex);
-
- return NULL;
-}
-
-static VkResult
-anv_queue_submit_post(struct anv_queue *queue,
- struct anv_queue_submit **_submit,
- bool flush_queue)
-{
- struct anv_queue_submit *submit = *_submit;
-
- /* Wait before signal behavior means we might keep alive the
- * anv_queue_submit object a bit longer, so transfer the ownership to the
- * anv_queue.
- */
- *_submit = NULL;
- if (queue->device->has_thread_submit) {
- pthread_mutex_lock(&queue->mutex);
- pthread_cond_broadcast(&queue->cond);
- list_addtail(&submit->link, &queue->queued_submits);
- pthread_mutex_unlock(&queue->mutex);
- return VK_SUCCESS;
- } else {
- pthread_mutex_lock(&queue->device->mutex);
- list_addtail(&submit->link, &queue->queued_submits);
- VkResult result = anv_device_submit_deferred_locked(queue->device);
- if (flush_queue) {
- while (result == VK_SUCCESS && !list_is_empty(&queue->queued_submits)) {
- int ret = pthread_cond_wait(&queue->device->queue_submit,
- &queue->device->mutex);
- if (ret != 0) {
- result = anv_device_set_lost(queue->device, "wait timeout");
- break;
- }
-
- result = anv_device_submit_deferred_locked(queue->device);
- }
- }
- pthread_mutex_unlock(&queue->device->mutex);
- return result;
- }
-}
-
-VkResult
-anv_queue_init(struct anv_device *device, struct anv_queue *queue,
- uint32_t exec_flags,
- const VkDeviceQueueCreateInfo *pCreateInfo)
-{
- struct anv_physical_device *pdevice = device->physical;
- VkResult result;
-
- queue->device = device;
- queue->flags = pCreateInfo->flags;
-
- assert(pCreateInfo->queueFamilyIndex < pdevice->queue.family_count);
- queue->family = &pdevice->queue.families[pCreateInfo->queueFamilyIndex];
-
- queue->exec_flags = exec_flags;
- queue->lost = false;
- queue->quit = false;
-
- list_inithead(&queue->queued_submits);
-
- /* We only need those additional thread/mutex when using a thread for
- * submission.
- */
- if (device->has_thread_submit) {
- if (pthread_mutex_init(&queue->mutex, NULL) != 0)
- return vk_error(VK_ERROR_INITIALIZATION_FAILED);
-
- if (pthread_cond_init(&queue->cond, NULL) != 0) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_mutex;
- }
- if (pthread_create(&queue->thread, NULL, anv_queue_task, queue)) {
- result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- goto fail_cond;
- }
- }
-
- vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE);
-
- return VK_SUCCESS;
-
- fail_cond:
- pthread_cond_destroy(&queue->cond);
- fail_mutex:
- pthread_mutex_destroy(&queue->mutex);
-
- return result;
-}
-
-void
-anv_queue_finish(struct anv_queue *queue)
-{
- if (queue->device->has_thread_submit) {
- pthread_mutex_lock(&queue->mutex);
- pthread_cond_broadcast(&queue->cond);
- queue->quit = true;
- pthread_mutex_unlock(&queue->mutex);
-
- void *ret;
- pthread_join(queue->thread, &ret);
-
- pthread_cond_destroy(&queue->cond);
- pthread_mutex_destroy(&queue->mutex);
- }
-
- vk_object_base_finish(&queue->base);
-}
-
-static VkResult
-anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit,
- struct anv_bo *bo,
- bool signal)
-{
- if (submit->fence_bo_count >= submit->fence_bo_array_length) {
- uint32_t new_len = MAX2(submit->fence_bo_array_length * 2, 64);
- uintptr_t *new_fence_bos =
- vk_realloc(submit->alloc,
- submit->fence_bos, new_len * sizeof(*submit->fence_bos),
- 8, submit->alloc_scope);
- if (new_fence_bos == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->fence_bos = new_fence_bos;
- submit->fence_bo_array_length = new_len;
- }
-
- /* Take advantage that anv_bo structs are allocated at 8 byte alignment so we can
- * use the lowest bit to store whether this is a BO we need to signal.
- */
- submit->fence_bos[submit->fence_bo_count++] = anv_pack_ptr(bo, 1, signal);
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_syncobj(struct anv_queue_submit* submit,
- struct anv_device *device,
- uint32_t handle, uint32_t flags,
- uint64_t value)
-{
- assert(flags != 0);
-
- if (device->has_thread_submit && (flags & I915_EXEC_FENCE_WAIT)) {
- if (submit->wait_timeline_count >= submit->wait_timeline_array_length) {
- uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64);
-
- uint32_t *new_wait_timeline_syncobjs =
- vk_realloc(submit->alloc,
- submit->wait_timeline_syncobjs,
- new_len * sizeof(*submit->wait_timeline_syncobjs),
- 8, submit->alloc_scope);
- if (new_wait_timeline_syncobjs == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->wait_timeline_syncobjs = new_wait_timeline_syncobjs;
-
- uint64_t *new_wait_timeline_values =
- vk_realloc(submit->alloc,
- submit->wait_timeline_values, new_len * sizeof(*submit->wait_timeline_values),
- 8, submit->alloc_scope);
- if (new_wait_timeline_values == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->wait_timeline_values = new_wait_timeline_values;
- submit->wait_timeline_array_length = new_len;
- }
-
- submit->wait_timeline_syncobjs[submit->wait_timeline_count] = handle;
- submit->wait_timeline_values[submit->wait_timeline_count] = value;
-
- submit->wait_timeline_count++;
- }
-
- if (submit->fence_count >= submit->fence_array_length) {
- uint32_t new_len = MAX2(submit->fence_array_length * 2, 64);
- struct drm_i915_gem_exec_fence *new_fences =
- vk_realloc(submit->alloc,
- submit->fences, new_len * sizeof(*submit->fences),
- 8, submit->alloc_scope);
- if (new_fences == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->fences = new_fences;
-
- uint64_t *new_fence_values =
- vk_realloc(submit->alloc,
- submit->fence_values, new_len * sizeof(*submit->fence_values),
- 8, submit->alloc_scope);
- if (new_fence_values == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->fence_values = new_fence_values;
- submit->fence_array_length = new_len;
- }
-
- submit->fences[submit->fence_count] = (struct drm_i915_gem_exec_fence) {
- .handle = handle,
- .flags = flags,
- };
- submit->fence_values[submit->fence_count] = value;
- submit->fence_count++;
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_timeline_wait(struct anv_queue_submit* submit,
- struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t value)
-{
- if (submit->wait_timeline_count >= submit->wait_timeline_array_length) {
- uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64);
- struct anv_timeline **new_wait_timelines =
- vk_realloc(submit->alloc,
- submit->wait_timelines, new_len * sizeof(*submit->wait_timelines),
- 8, submit->alloc_scope);
- if (new_wait_timelines == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->wait_timelines = new_wait_timelines;
-
- uint64_t *new_wait_timeline_values =
- vk_realloc(submit->alloc,
- submit->wait_timeline_values, new_len * sizeof(*submit->wait_timeline_values),
- 8, submit->alloc_scope);
- if (new_wait_timeline_values == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->wait_timeline_values = new_wait_timeline_values;
-
- submit->wait_timeline_array_length = new_len;
- }
-
- submit->wait_timelines[submit->wait_timeline_count] = timeline;
- submit->wait_timeline_values[submit->wait_timeline_count] = value;
-
- submit->wait_timeline_count++;
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_timeline_signal(struct anv_queue_submit* submit,
- struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t value)
-{
- assert(timeline->highest_pending < value);
-
- if (submit->signal_timeline_count >= submit->signal_timeline_array_length) {
- uint32_t new_len = MAX2(submit->signal_timeline_array_length * 2, 64);
- struct anv_timeline **new_signal_timelines =
- vk_realloc(submit->alloc,
- submit->signal_timelines, new_len * sizeof(*submit->signal_timelines),
- 8, submit->alloc_scope);
- if (new_signal_timelines == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->signal_timelines = new_signal_timelines;
-
- uint64_t *new_signal_timeline_values =
- vk_realloc(submit->alloc,
- submit->signal_timeline_values, new_len * sizeof(*submit->signal_timeline_values),
- 8, submit->alloc_scope);
- if (new_signal_timeline_values == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->signal_timeline_values = new_signal_timeline_values;
-
- submit->signal_timeline_array_length = new_len;
- }
-
- submit->signal_timelines[submit->signal_timeline_count] = timeline;
- submit->signal_timeline_values[submit->signal_timeline_count] = value;
-
- submit->signal_timeline_count++;
-
- return VK_SUCCESS;
-}
-
-static struct anv_queue_submit *
-anv_queue_submit_alloc(struct anv_device *device)
-{
- const VkAllocationCallbacks *alloc = &device->vk.alloc;
- VkSystemAllocationScope alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE;
-
- struct anv_queue_submit *submit = vk_zalloc(alloc, sizeof(*submit), 8, alloc_scope);
- if (!submit)
- return NULL;
-
- submit->alloc = alloc;
- submit->alloc_scope = alloc_scope;
- submit->in_fence = -1;
- submit->out_fence = -1;
- submit->perf_query_pass = -1;
-
- return submit;
-}
-
-VkResult
-anv_queue_submit_simple_batch(struct anv_queue *queue,
- struct anv_batch *batch)
+anv_destroy_engine(struct anv_queue *queue)
{
- if (queue->device->info.no_hw)
- return VK_SUCCESS;
-
struct anv_device *device = queue->device;
- struct anv_queue_submit *submit = anv_queue_submit_alloc(device);
- if (!submit)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- bool has_syncobj_wait = device->physical->has_syncobj_wait;
- VkResult result;
- uint32_t syncobj;
- struct anv_bo *batch_bo, *sync_bo;
-
- if (has_syncobj_wait) {
- syncobj = anv_gem_syncobj_create(device, 0);
- if (!syncobj) {
- result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
- goto err_free_submit;
- }
-
- result = anv_queue_submit_add_syncobj(submit, device, syncobj,
- I915_EXEC_FENCE_SIGNAL, 0);
- } else {
- result = anv_device_alloc_bo(device, "simple-batch-sync", 4096,
- ANV_BO_ALLOC_EXTERNAL |
- ANV_BO_ALLOC_IMPLICIT_SYNC,
- 0 /* explicit_address */,
- &sync_bo);
- if (result != VK_SUCCESS)
- goto err_free_submit;
-
- result = anv_queue_submit_add_fence_bo(submit, sync_bo, true /* signal */);
- }
-
- if (result != VK_SUCCESS)
- goto err_destroy_sync_primitive;
-
- if (batch) {
- uint32_t size = align_u32(batch->next - batch->start, 8);
- result = anv_bo_pool_alloc(&device->batch_bo_pool, size, &batch_bo);
- if (result != VK_SUCCESS)
- goto err_destroy_sync_primitive;
-
- memcpy(batch_bo->map, batch->start, size);
- if (!device->info.has_llc)
- intel_flush_range(batch_bo->map, size);
-
- submit->simple_bo = batch_bo;
- submit->simple_bo_size = size;
- }
-
- result = anv_queue_submit_post(queue, &submit, true);
-
- if (result == VK_SUCCESS) {
- if (has_syncobj_wait) {
- if (anv_gem_syncobj_wait(device, &syncobj, 1,
- anv_get_absolute_timeout(INT64_MAX), true))
- result = anv_device_set_lost(device, "anv_gem_syncobj_wait failed: %m");
- anv_gem_syncobj_destroy(device, syncobj);
- } else {
- result = anv_device_wait(device, sync_bo,
- anv_get_relative_timeout(INT64_MAX));
- anv_device_release_bo(device, sync_bo);
- }
- }
-
- if (batch)
- anv_bo_pool_free(&device->batch_bo_pool, batch_bo);
-
- if (submit)
- anv_queue_submit_free(device, submit);
-
- return result;
-
- err_destroy_sync_primitive:
- if (has_syncobj_wait)
- anv_gem_syncobj_destroy(device, syncobj);
- else
- anv_device_release_bo(device, sync_bo);
- err_free_submit:
- if (submit)
- anv_queue_submit_free(device, submit);
-
- return result;
-}
-
-/* Transfer ownership of temporary semaphores from the VkSemaphore object to
- * the anv_queue_submit object. Those temporary semaphores are then freed in
- * anv_queue_submit_free() once the driver is finished with them.
- */
-static VkResult
-maybe_transfer_temporary_semaphore(struct anv_queue_submit *submit,
- struct anv_semaphore *semaphore,
- struct anv_semaphore_impl **out_impl)
-{
- struct anv_semaphore_impl *impl = &semaphore->temporary;
-
- if (impl->type == ANV_SEMAPHORE_TYPE_NONE) {
- *out_impl = &semaphore->permanent;
- return VK_SUCCESS;
- }
-
- /* BO backed timeline semaphores cannot be temporary. */
- assert(impl->type != ANV_SEMAPHORE_TYPE_TIMELINE);
-
- /*
- * There is a requirement to reset semaphore to their permanent state after
- * submission. From the Vulkan 1.0.53 spec:
- *
- * "If the import is temporary, the implementation must restore the
- * semaphore to its prior permanent state after submitting the next
- * semaphore wait operation."
- *
- * In the case we defer the actual submission to a thread because of the
- * wait-before-submit behavior required for timeline semaphores, we need to
- * make copies of the temporary syncobj to ensure they stay alive until we
- * do the actual execbuffer ioctl.
- */
- if (submit->temporary_semaphore_count >= submit->temporary_semaphore_array_length) {
- uint32_t new_len = MAX2(submit->temporary_semaphore_array_length * 2, 8);
- /* Make sure that if the realloc fails, we still have the old semaphore
- * array around to properly clean things up on failure.
- */
- struct anv_semaphore_impl *new_array =
- vk_realloc(submit->alloc,
- submit->temporary_semaphores,
- new_len * sizeof(*submit->temporary_semaphores),
- 8, submit->alloc_scope);
- if (new_array == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->temporary_semaphores = new_array;
- submit->temporary_semaphore_array_length = new_len;
- }
-
- /* Copy anv_semaphore_impl into anv_queue_submit. */
- submit->temporary_semaphores[submit->temporary_semaphore_count++] = *impl;
- *out_impl = &submit->temporary_semaphores[submit->temporary_semaphore_count - 1];
-
- /* Clear the incoming semaphore */
- impl->type = ANV_SEMAPHORE_TYPE_NONE;
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_in_semaphores(struct anv_queue_submit *submit,
- struct anv_device *device,
- const VkSemaphore *in_semaphores,
- const uint64_t *in_values,
- uint32_t num_in_semaphores)
-{
- VkResult result;
-
- for (uint32_t i = 0; i < num_in_semaphores; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
- struct anv_semaphore_impl *impl;
-
- result = maybe_transfer_temporary_semaphore(submit, semaphore, &impl);
- if (result != VK_SUCCESS)
- return result;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_WSI_BO:
- /* When using a window-system buffer as a semaphore, always enable
- * EXEC_OBJECT_WRITE. This gives us a WaR hazard with the display or
- * compositor's read of the buffer and enforces that we don't start
- * rendering until they are finished. This is exactly the
- * synchronization we want with vkAcquireNextImage.
- */
- result = anv_queue_submit_add_fence_bo(submit, impl->bo, true /* signal */);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: {
- result = anv_queue_submit_add_syncobj(submit, device,
- impl->syncobj,
- I915_EXEC_FENCE_WAIT,
- 0);
- if (result != VK_SUCCESS)
- return result;
- break;
- }
-
- case ANV_SEMAPHORE_TYPE_TIMELINE:
- assert(in_values);
- if (in_values[i] == 0)
- break;
- result = anv_queue_submit_add_timeline_wait(submit, device,
- &impl->timeline,
- in_values[i]);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE:
- assert(in_values);
- if (in_values[i] == 0)
- break;
- result = anv_queue_submit_add_syncobj(submit, device,
- impl->syncobj,
- I915_EXEC_FENCE_WAIT,
- in_values[i]);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- default:
- break;
- }
- }
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_out_semaphores(struct anv_queue_submit *submit,
- struct anv_device *device,
- const VkSemaphore *out_semaphores,
- const uint64_t *out_values,
- uint32_t num_out_semaphores)
-{
- VkResult result;
-
- for (uint32_t i = 0; i < num_out_semaphores; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
-
- /* Under most circumstances, out fences won't be temporary. However,
- * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec:
- *
- * "If the import is temporary, the implementation must restore the
- * semaphore to its prior permanent state after submitting the next
- * semaphore wait operation."
- *
- * The spec says nothing whatsoever about signal operations on
- * temporarily imported semaphores so it appears they are allowed.
- * There are also CTS tests that require this to work.
- */
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: {
- /*
- * Reset the content of the syncobj so it doesn't contain a
- * previously signaled dma-fence, until one is added by EXECBUFFER by
- * the submission thread.
- */
- anv_gem_syncobj_reset(device, impl->syncobj);
-
- result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj,
- I915_EXEC_FENCE_SIGNAL,
- 0);
- if (result != VK_SUCCESS)
- return result;
- break;
- }
-
- case ANV_SEMAPHORE_TYPE_TIMELINE:
- assert(out_values);
- if (out_values[i] == 0)
- break;
- result = anv_queue_submit_add_timeline_signal(submit, device,
- &impl->timeline,
- out_values[i]);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE:
- assert(out_values);
- if (out_values[i] == 0)
- break;
- result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj,
- I915_EXEC_FENCE_SIGNAL,
- out_values[i]);
- if (result != VK_SUCCESS)
- return result;
- break;
-
- default:
- break;
- }
- }
-
- return VK_SUCCESS;
-}
-
-static VkResult
-anv_queue_submit_add_fence(struct anv_queue_submit *submit,
- struct anv_device *device,
- struct anv_fence *fence)
-{
- /* Under most circumstances, out fences won't be temporary. However, the
- * spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec:
- *
- * "If the import is temporary, the implementation must restore the
- * semaphore to its prior permanent state after submitting the next
- * semaphore wait operation."
- *
- * The spec says nothing whatsoever about signal operations on temporarily
- * imported semaphores so it appears they are allowed. There are also CTS
- * tests that require this to work.
- */
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- VkResult result;
-
- switch (impl->type) {
- case ANV_FENCE_TYPE_BO:
- assert(!device->has_thread_submit);
- result = anv_queue_submit_add_fence_bo(submit, impl->bo.bo, true /* signal */);
- if (result != VK_SUCCESS)
- return result;
+ switch (device->info->kmd_type) {
+ case INTEL_KMD_TYPE_I915:
+ anv_i915_destroy_engine(device, queue);
break;
-
- case ANV_FENCE_TYPE_SYNCOBJ: {
- /*
- * For the same reason we reset the signaled binary syncobj above, also
- * reset the fence's syncobj so that they don't contain a signaled
- * dma-fence.
- */
- anv_gem_syncobj_reset(device, impl->syncobj);
-
- result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj,
- I915_EXEC_FENCE_SIGNAL,
- 0);
- if (result != VK_SUCCESS)
- return result;
+ case INTEL_KMD_TYPE_XE:
+ anv_xe_destroy_engine(device, queue);
break;
- }
-
default:
- unreachable("Invalid fence type");
- }
-
- return VK_SUCCESS;
-}
-
-static void
-anv_post_queue_fence_update(struct anv_device *device, struct anv_fence *fence)
-{
- if (fence->permanent.type == ANV_FENCE_TYPE_BO) {
- assert(!device->has_thread_submit);
- /* If we have permanent BO fence, the only type of temporary possible
- * would be BO_WSI (because BO fences are not shareable). The Vulkan spec
- * also requires that the fence passed to vkQueueSubmit() be :
- *
- * * unsignaled
- * * not be associated with any other queue command that has not yet
- * completed execution on that queue
- *
- * So the only acceptable type for the temporary is NONE.
- */
- assert(fence->temporary.type == ANV_FENCE_TYPE_NONE);
-
- /* Once the execbuf has returned, we need to set the fence state to
- * SUBMITTED. We can't do this before calling execbuf because
- * anv_GetFenceStatus does take the global device lock before checking
- * fence->state.
- *
- * We set the fence state to SUBMITTED regardless of whether or not the
- * execbuf succeeds because we need to ensure that vkWaitForFences() and
- * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
- * VK_SUCCESS) in a finite amount of time even if execbuf fails.
- */
- fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED;
- }
-}
-
-static VkResult
-anv_queue_submit_add_cmd_buffer(struct anv_queue_submit *submit,
- struct anv_cmd_buffer *cmd_buffer,
- int perf_pass)
-{
- if (submit->cmd_buffer_count >= submit->cmd_buffer_array_length) {
- uint32_t new_len = MAX2(submit->cmd_buffer_array_length * 2, 4);
- struct anv_cmd_buffer **new_cmd_buffers =
- vk_realloc(submit->alloc,
- submit->cmd_buffers, new_len * sizeof(*submit->cmd_buffers),
- 8, submit->alloc_scope);
- if (new_cmd_buffers == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- submit->cmd_buffers = new_cmd_buffers;
- submit->cmd_buffer_array_length = new_len;
+ unreachable("Missing");
}
-
- submit->cmd_buffers[submit->cmd_buffer_count++] = cmd_buffer;
- /* Only update the perf_query_pool if there is one. We can decide to batch
- * 2 command buffers if the second one doesn't use a query pool, but we
- * can't drop the already chosen one.
- */
- if (cmd_buffer->perf_query_pool)
- submit->perf_query_pool = cmd_buffer->perf_query_pool;
- submit->perf_query_pass = perf_pass;
-
- return VK_SUCCESS;
}
-static bool
-anv_queue_submit_can_add_cmd_buffer(const struct anv_queue_submit *submit,
- const struct anv_cmd_buffer *cmd_buffer,
- int perf_pass)
-{
- /* If first command buffer, no problem. */
- if (submit->cmd_buffer_count == 0)
- return true;
-
- /* Can we chain the last buffer into the next one? */
- if (!anv_cmd_buffer_is_chainable(submit->cmd_buffers[submit->cmd_buffer_count - 1]))
- return false;
-
- /* A change of perf query pools between VkSubmitInfo elements means we
- * can't batch things up.
- */
- if (cmd_buffer->perf_query_pool &&
- submit->perf_query_pool &&
- submit->perf_query_pool != cmd_buffer->perf_query_pool)
- return false;
-
- /* A change of perf pass also prevents batching things up.
- */
- if (submit->perf_query_pass != -1 &&
- submit->perf_query_pass != perf_pass)
- return false;
-
- return true;
-}
-
-static bool
-anv_queue_submit_can_add_submit(const struct anv_queue_submit *submit,
- uint32_t n_wait_semaphores,
- uint32_t n_signal_semaphores,
- int perf_pass)
+VkResult
+anv_queue_init(struct anv_device *device, struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo,
+ uint32_t index_in_family)
{
- /* We can add to an empty anv_queue_submit. */
- if (submit->cmd_buffer_count == 0 &&
- submit->fence_count == 0 &&
- submit->wait_timeline_count == 0 &&
- submit->signal_timeline_count == 0 &&
- submit->fence_bo_count == 0)
- return true;
-
- /* Different perf passes will require different EXECBUF ioctls. */
- if (perf_pass != submit->perf_query_pass)
- return false;
-
- /* If the current submit is signaling anything, we can't add anything. */
- if (submit->signal_timeline_count)
- return false;
-
- /* If a submit is waiting on anything, anything that happened before needs
- * to be submitted.
- */
- if (n_wait_semaphores)
- return false;
-
- return true;
-}
+ struct anv_physical_device *pdevice = device->physical;
+ assert(queue->vk.queue_family_index < pdevice->queue.family_count);
+ struct anv_queue_family *queue_family =
+ &device->physical->queue.families[pCreateInfo->queueFamilyIndex];
+ VkResult result;
-static VkResult
-anv_queue_submit_post_and_alloc_new(struct anv_queue *queue,
- struct anv_queue_submit **submit)
-{
- VkResult result = anv_queue_submit_post(queue, submit, false);
+ result = vk_queue_init(&queue->vk, &device->vk, pCreateInfo,
+ index_in_family);
if (result != VK_SUCCESS)
return result;
- *submit = anv_queue_submit_alloc(queue->device);
- if (!*submit)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- return VK_SUCCESS;
-}
-
-VkResult anv_QueueSubmit(
- VkQueue _queue,
- uint32_t submitCount,
- const VkSubmitInfo* pSubmits,
- VkFence _fence)
-{
- ANV_FROM_HANDLE(anv_queue, queue, _queue);
- ANV_FROM_HANDLE(anv_fence, fence, _fence);
- struct anv_device *device = queue->device;
-
- if (device->info.no_hw)
- return VK_SUCCESS;
+ queue->vk.driver_submit = anv_queue_submit;
+ queue->device = device;
+ queue->family = queue_family;
+ queue->decoder = &device->decoder[queue->vk.queue_family_index];
- /* Query for device status prior to submitting. Technically, we don't need
- * to do this. However, if we have a client that's submitting piles of
- * garbage, we would rather break as early as possible to keep the GPU
- * hanging contained. If we don't check here, we'll either be waiting for
- * the kernel to kick us or we'll have to wait until the client waits on a
- * fence before we actually know whether or not we've hung.
- */
- VkResult result = anv_device_query_status(device);
- if (result != VK_SUCCESS)
+ result = anv_create_engine(device, queue, pCreateInfo);
+ if (result != VK_SUCCESS) {
+ vk_queue_finish(&queue->vk);
return result;
-
- struct anv_queue_submit *submit = anv_queue_submit_alloc(device);
- if (!submit)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- for (uint32_t i = 0; i < submitCount; i++) {
- const struct wsi_memory_signal_submit_info *mem_signal_info =
- vk_find_struct_const(pSubmits[i].pNext,
- WSI_MEMORY_SIGNAL_SUBMIT_INFO_MESA);
- struct anv_bo *wsi_signal_bo =
- mem_signal_info && mem_signal_info->memory != VK_NULL_HANDLE ?
- anv_device_memory_from_handle(mem_signal_info->memory)->bo : NULL;
-
- const VkTimelineSemaphoreSubmitInfoKHR *timeline_info =
- vk_find_struct_const(pSubmits[i].pNext,
- TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR);
- const VkPerformanceQuerySubmitInfoKHR *perf_info =
- vk_find_struct_const(pSubmits[i].pNext,
- PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
- const int perf_pass = perf_info ? perf_info->counterPassIndex : 0;
- const uint64_t *wait_values =
- timeline_info && timeline_info->waitSemaphoreValueCount ?
- timeline_info->pWaitSemaphoreValues : NULL;
- const uint64_t *signal_values =
- timeline_info && timeline_info->signalSemaphoreValueCount ?
- timeline_info->pSignalSemaphoreValues : NULL;
-
- if (!anv_queue_submit_can_add_submit(submit,
- pSubmits[i].waitSemaphoreCount,
- pSubmits[i].signalSemaphoreCount,
- perf_pass)) {
- result = anv_queue_submit_post_and_alloc_new(queue, &submit);
- if (result != VK_SUCCESS)
- goto out;
- }
-
- /* Wait semaphores */
- result = anv_queue_submit_add_in_semaphores(submit,
- device,
- pSubmits[i].pWaitSemaphores,
- wait_values,
- pSubmits[i].waitSemaphoreCount);
- if (result != VK_SUCCESS)
- goto out;
-
- /* Command buffers */
- for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
- pSubmits[i].pCommandBuffers[j]);
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
- assert(!anv_batch_has_error(&cmd_buffer->batch));
- anv_measure_submit(cmd_buffer);
-
- /* If we can't add an additional command buffer to the existing
- * anv_queue_submit, post it and create a new one.
- */
- if (!anv_queue_submit_can_add_cmd_buffer(submit, cmd_buffer, perf_pass)) {
- result = anv_queue_submit_post_and_alloc_new(queue, &submit);
- if (result != VK_SUCCESS)
- goto out;
- }
-
- result = anv_queue_submit_add_cmd_buffer(submit, cmd_buffer, perf_pass);
- if (result != VK_SUCCESS)
- goto out;
- }
-
- /* Signal semaphores */
- result = anv_queue_submit_add_out_semaphores(submit,
- device,
- pSubmits[i].pSignalSemaphores,
- signal_values,
- pSubmits[i].signalSemaphoreCount);
- if (result != VK_SUCCESS)
- goto out;
-
- /* WSI BO */
- if (wsi_signal_bo) {
- result = anv_queue_submit_add_fence_bo(submit, wsi_signal_bo,
- true /* signal */);
- if (result != VK_SUCCESS)
- goto out;
- }
- }
-
- if (fence) {
- result = anv_queue_submit_add_fence(submit, device, fence);
- if (result != VK_SUCCESS)
- goto out;
}
- result = anv_queue_submit_post(queue, &submit, false);
- if (result != VK_SUCCESS)
- goto out;
-
- if (fence)
- anv_post_queue_fence_update(device, fence);
-
-out:
- if (submit)
- anv_queue_submit_free(device, submit);
-
- if (result != VK_SUCCESS && result != VK_ERROR_DEVICE_LOST) {
- /* In the case that something has gone wrong we may end up with an
- * inconsistent state from which it may not be trivial to recover.
- * For example, we might have computed address relocations and
- * any future attempt to re-submit this job will need to know about
- * this and avoid computing relocation addresses again.
- *
- * To avoid this sort of issue, we assume that if something went
- * wrong during submission we must already be in a really bad situation
- * anyway (such as being out of memory) and return
- * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
- * submit the same job again to this device.
- *
- * We skip doing this on VK_ERROR_DEVICE_LOST because
- * anv_device_set_lost() would have been called already by a callee of
- * anv_queue_submit().
- */
- result = anv_device_set_lost(device, "vkQueueSubmit() failed");
- }
-
- return result;
-}
-
-VkResult anv_QueueWaitIdle(
- VkQueue _queue)
-{
- ANV_FROM_HANDLE(anv_queue, queue, _queue);
-
- if (anv_device_is_lost(queue->device))
- return VK_ERROR_DEVICE_LOST;
-
- return anv_queue_submit_simple_batch(queue, NULL);
-}
-
-VkResult anv_CreateFence(
- VkDevice _device,
- const VkFenceCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkFence* pFence)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_fence *fence;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);
-
- fence = vk_object_zalloc(&device->vk, pAllocator, sizeof(*fence),
- VK_OBJECT_TYPE_FENCE);
- if (fence == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (device->physical->has_syncobj_wait) {
- fence->permanent.type = ANV_FENCE_TYPE_SYNCOBJ;
-
- uint32_t create_flags = 0;
- if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT)
- create_flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
-
- fence->permanent.syncobj = anv_gem_syncobj_create(device, create_flags);
- if (!fence->permanent.syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- } else {
- fence->permanent.type = ANV_FENCE_TYPE_BO;
-
- VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, 4096,
- &fence->permanent.bo.bo);
- if (result != VK_SUCCESS)
+ if (INTEL_DEBUG(DEBUG_SYNC)) {
+ result = vk_sync_create(&device->vk,
+ &device->physical->sync_syncobj_type,
+ 0, 0, &queue->sync);
+ if (result != VK_SUCCESS) {
+ anv_queue_finish(queue);
return result;
-
- if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
- fence->permanent.bo.state = ANV_BO_FENCE_STATE_SIGNALED;
- } else {
- fence->permanent.bo.state = ANV_BO_FENCE_STATE_RESET;
- }
- }
-
- *pFence = anv_fence_to_handle(fence);
-
- return VK_SUCCESS;
-}
-
-static void
-anv_fence_impl_cleanup(struct anv_device *device,
- struct anv_fence_impl *impl)
-{
- switch (impl->type) {
- case ANV_FENCE_TYPE_NONE:
- /* Dummy. Nothing to do */
- break;
-
- case ANV_FENCE_TYPE_BO:
- anv_bo_pool_free(&device->batch_bo_pool, impl->bo.bo);
- break;
-
- case ANV_FENCE_TYPE_WSI_BO:
- anv_device_release_bo(device, impl->bo.bo);
- break;
-
- case ANV_FENCE_TYPE_SYNCOBJ:
- anv_gem_syncobj_destroy(device, impl->syncobj);
- break;
-
- case ANV_FENCE_TYPE_WSI:
- impl->fence_wsi->destroy(impl->fence_wsi);
- break;
-
- default:
- unreachable("Invalid fence type");
- }
-
- impl->type = ANV_FENCE_TYPE_NONE;
-}
-
-void
-anv_fence_reset_temporary(struct anv_device *device,
- struct anv_fence *fence)
-{
- if (fence->temporary.type == ANV_FENCE_TYPE_NONE)
- return;
-
- anv_fence_impl_cleanup(device, &fence->temporary);
-}
-
-void anv_DestroyFence(
- VkDevice _device,
- VkFence _fence,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, _fence);
-
- if (!fence)
- return;
-
- anv_fence_impl_cleanup(device, &fence->temporary);
- anv_fence_impl_cleanup(device, &fence->permanent);
-
- vk_object_free(&device->vk, pAllocator, fence);
-}
-
-VkResult anv_ResetFences(
- VkDevice _device,
- uint32_t fenceCount,
- const VkFence* pFences)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "If any member of pFences currently has its payload imported with
- * temporary permanence, that fence’s prior permanent payload is
- * first restored. The remaining operations described therefore
- * operate on the restored payload.
- */
- anv_fence_reset_temporary(device, fence);
-
- struct anv_fence_impl *impl = &fence->permanent;
-
- switch (impl->type) {
- case ANV_FENCE_TYPE_BO:
- impl->bo.state = ANV_BO_FENCE_STATE_RESET;
- break;
-
- case ANV_FENCE_TYPE_SYNCOBJ:
- anv_gem_syncobj_reset(device, impl->syncobj);
- break;
-
- default:
- unreachable("Invalid fence type");
- }
- }
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetFenceStatus(
- VkDevice _device,
- VkFence _fence)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, _fence);
-
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
-
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- switch (impl->type) {
- case ANV_FENCE_TYPE_BO:
- case ANV_FENCE_TYPE_WSI_BO:
- switch (impl->bo.state) {
- case ANV_BO_FENCE_STATE_RESET:
- /* If it hasn't even been sent off to the GPU yet, it's not ready */
- return VK_NOT_READY;
-
- case ANV_BO_FENCE_STATE_SIGNALED:
- /* It's been signaled, return success */
- return VK_SUCCESS;
-
- case ANV_BO_FENCE_STATE_SUBMITTED: {
- VkResult result = anv_device_bo_busy(device, impl->bo.bo);
- if (result == VK_SUCCESS) {
- impl->bo.state = ANV_BO_FENCE_STATE_SIGNALED;
- return VK_SUCCESS;
- } else {
- return result;
- }
- }
- default:
- unreachable("Invalid fence status");
- }
-
- case ANV_FENCE_TYPE_SYNCOBJ: {
- if (device->has_thread_submit) {
- uint64_t binary_value = 0;
- int ret = anv_gem_syncobj_timeline_wait(device, &impl->syncobj,
- &binary_value, 1, 0,
- true /* wait_all */,
- false /* wait_materialize */);
- if (ret == -1) {
- if (errno == ETIME) {
- return VK_NOT_READY;
- } else {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
- }
- } else {
- return VK_SUCCESS;
- }
- } else {
- int ret = anv_gem_syncobj_wait(device, &impl->syncobj, 1, 0, false);
- if (ret == -1) {
- if (errno == ETIME) {
- return VK_NOT_READY;
- } else {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
- }
- } else {
- return VK_SUCCESS;
- }
- }
- }
-
- default:
- unreachable("Invalid fence type");
- }
-}
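/* The temporary-overrides-permanent selection in anv_GetFenceStatus() above
 * recurs throughout this (removed) fence code. A minimal helper capturing the
 * pattern might look like this; the example_ prefix marks it as a hypothetical
 * illustration, not code from this tree.
 */
static inline struct anv_fence_impl *
example_fence_active_impl(struct anv_fence *fence)
{
   /* A temporary payload, if any, takes precedence over the permanent one. */
   return fence->temporary.type != ANV_FENCE_TYPE_NONE ?
          &fence->temporary : &fence->permanent;
}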
-
-static VkResult
-anv_wait_for_syncobj_fences(struct anv_device *device,
- uint32_t fenceCount,
- const VkFence *pFences,
- bool waitAll,
- uint64_t abs_timeout_ns)
-{
- uint32_t *syncobjs = vk_zalloc(&device->vk.alloc,
- sizeof(*syncobjs) * fenceCount, 8,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
- if (!syncobjs)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- assert(fence->permanent.type == ANV_FENCE_TYPE_SYNCOBJ);
-
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- assert(impl->type == ANV_FENCE_TYPE_SYNCOBJ);
- syncobjs[i] = impl->syncobj;
- }
-
- int ret = 0;
- /* The gem_syncobj_wait ioctl may return early due to an inherent
- * limitation in the way it computes timeouts. Loop until we've actually
- * passed the timeout.
- */
- do {
- ret = anv_gem_syncobj_wait(device, syncobjs, fenceCount,
- abs_timeout_ns, waitAll);
- } while (ret == -1 && errno == ETIME && anv_gettime_ns() < abs_timeout_ns);
-
- vk_free(&device->vk.alloc, syncobjs);
-
- if (ret == -1) {
- if (errno == ETIME) {
- return VK_TIMEOUT;
- } else {
- /* We don't know the real error. */
- return anv_device_set_lost(device, "drm_syncobj_wait failed: %m");
- }
- } else {
- return VK_SUCCESS;
- }
-}
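/* Sketch of what the timeout helpers used above (anv_gettime_ns(),
 * anv_get_absolute_timeout()) are assumed to do; the real implementations live
 * elsewhere in the driver and may differ in detail. Assumes <time.h>.
 */
static uint64_t
example_gettime_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

static uint64_t
example_absolute_timeout(uint64_t timeout_ns)
{
   /* Saturate instead of wrapping when the caller passes UINT64_MAX. */
   uint64_t now = example_gettime_ns();
   return timeout_ns > UINT64_MAX - now ? UINT64_MAX : now + timeout_ns;
}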
-
-static VkResult
-anv_wait_for_bo_fences(struct anv_device *device,
- uint32_t fenceCount,
- const VkFence *pFences,
- bool waitAll,
- uint64_t abs_timeout_ns)
-{
- VkResult result = VK_SUCCESS;
- uint32_t pending_fences = fenceCount;
- while (pending_fences) {
- pending_fences = 0;
- bool signaled_fences = false;
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
-
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
- assert(impl->type == ANV_FENCE_TYPE_BO ||
- impl->type == ANV_FENCE_TYPE_WSI_BO);
-
- switch (impl->bo.state) {
- case ANV_BO_FENCE_STATE_RESET:
- /* This fence hasn't been submitted yet, we'll catch it the next
- * time around. Yes, this may mean we dead-loop but, short of
- * lots of locking and a condition variable, there's not much that
- * we can do about that.
- */
- pending_fences++;
- continue;
-
- case ANV_BO_FENCE_STATE_SIGNALED:
- /* This fence is not pending. If waitAll isn't set, we can return
- * early. Otherwise, we have to keep going.
- */
- if (!waitAll) {
- result = VK_SUCCESS;
- goto done;
- }
- continue;
-
- case ANV_BO_FENCE_STATE_SUBMITTED:
- /* These are the fences we really care about. Go ahead and wait
- * on it until we hit a timeout.
- */
- result = anv_device_wait(device, impl->bo.bo,
- anv_get_relative_timeout(abs_timeout_ns));
- switch (result) {
- case VK_SUCCESS:
- impl->bo.state = ANV_BO_FENCE_STATE_SIGNALED;
- signaled_fences = true;
- if (!waitAll)
- goto done;
- break;
-
- case VK_TIMEOUT:
- goto done;
-
- default:
- return result;
- }
- }
- }
-
- if (pending_fences && !signaled_fences) {
- /* If we've hit this then someone decided to vkWaitForFences before
- * they've actually submitted any of them to a queue. This is a
- * fairly pessimal case, so it's ok to lock here and use a standard
- * pthreads condition variable.
- */
- pthread_mutex_lock(&device->mutex);
-
- /* It's possible that some of the fences have changed state since the
- * last time we checked. Now that we have the lock, check for
- * pending fences again and don't wait if it's changed.
- */
- uint32_t now_pending_fences = 0;
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- if (fence->permanent.bo.state == ANV_BO_FENCE_STATE_RESET)
- now_pending_fences++;
- }
- assert(now_pending_fences <= pending_fences);
-
- if (now_pending_fences == pending_fences) {
- struct timespec abstime = {
- .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
- .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
- };
-
- ASSERTED int ret;
- ret = pthread_cond_timedwait(&device->queue_submit,
- &device->mutex, &abstime);
- assert(ret != EINVAL);
- if (anv_gettime_ns() >= abs_timeout_ns) {
- pthread_mutex_unlock(&device->mutex);
- result = VK_TIMEOUT;
- goto done;
- }
- }
-
- pthread_mutex_unlock(&device->mutex);
- }
- }
-
-done:
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
-
- return result;
-}
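/* The absolute-nanosecond deadline to struct timespec conversion used with
 * pthread_cond_timedwait() above (and again in the timeline waits below) is
 * repeated inline; an illustrative helper, not part of the original code:
 */
static inline struct timespec
example_abs_timeout_to_timespec(uint64_t abs_timeout_ns)
{
   return (struct timespec) {
      .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
      .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
   };
}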
-
-static VkResult
-anv_wait_for_wsi_fence(struct anv_device *device,
- struct anv_fence_impl *impl,
- uint64_t abs_timeout)
-{
- return impl->fence_wsi->wait(impl->fence_wsi, abs_timeout);
-}
-
-static VkResult
-anv_wait_for_fences(struct anv_device *device,
- uint32_t fenceCount,
- const VkFence *pFences,
- bool waitAll,
- uint64_t abs_timeout)
-{
- VkResult result = VK_SUCCESS;
-
- if (fenceCount <= 1 || waitAll) {
- for (uint32_t i = 0; i < fenceCount; i++) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- switch (impl->type) {
- case ANV_FENCE_TYPE_BO:
- assert(!device->physical->has_syncobj_wait);
- FALLTHROUGH;
- case ANV_FENCE_TYPE_WSI_BO:
- result = anv_wait_for_bo_fences(device, 1, &pFences[i],
- true, abs_timeout);
- break;
- case ANV_FENCE_TYPE_SYNCOBJ:
- result = anv_wait_for_syncobj_fences(device, 1, &pFences[i],
- true, abs_timeout);
- break;
- case ANV_FENCE_TYPE_WSI:
- result = anv_wait_for_wsi_fence(device, impl, abs_timeout);
- break;
- case ANV_FENCE_TYPE_NONE:
- result = VK_SUCCESS;
- break;
- }
- if (result != VK_SUCCESS)
- return result;
}
- } else {
- do {
- for (uint32_t i = 0; i < fenceCount; i++) {
- if (anv_wait_for_fences(device, 1, &pFences[i], true, 0) == VK_SUCCESS)
- return VK_SUCCESS;
- }
- } while (anv_gettime_ns() < abs_timeout);
- result = VK_TIMEOUT;
}
- return result;
-}
-
-static bool anv_all_fences_syncobj(uint32_t fenceCount, const VkFence *pFences)
-{
- for (uint32_t i = 0; i < fenceCount; ++i) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
- if (impl->type != ANV_FENCE_TYPE_SYNCOBJ)
- return false;
- }
- return true;
-}
-
-static bool anv_all_fences_bo(uint32_t fenceCount, const VkFence *pFences)
-{
- for (uint32_t i = 0; i < fenceCount; ++i) {
- ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
- if (impl->type != ANV_FENCE_TYPE_BO &&
- impl->type != ANV_FENCE_TYPE_WSI_BO)
- return false;
- }
- return true;
-}
-
-VkResult anv_WaitForFences(
- VkDevice _device,
- uint32_t fenceCount,
- const VkFence* pFences,
- VkBool32 waitAll,
- uint64_t timeout)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- if (device->info.no_hw)
- return VK_SUCCESS;
- if (anv_device_is_lost(device))
- return VK_ERROR_DEVICE_LOST;
-
- uint64_t abs_timeout = anv_get_absolute_timeout(timeout);
- if (anv_all_fences_syncobj(fenceCount, pFences)) {
- return anv_wait_for_syncobj_fences(device, fenceCount, pFences,
- waitAll, abs_timeout);
- } else if (anv_all_fences_bo(fenceCount, pFences)) {
- return anv_wait_for_bo_fences(device, fenceCount, pFences,
- waitAll, abs_timeout);
- } else {
- return anv_wait_for_fences(device, fenceCount, pFences,
- waitAll, abs_timeout);
- }
-}
-
-void anv_GetPhysicalDeviceExternalFenceProperties(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceExternalFenceInfo* pExternalFenceInfo,
- VkExternalFenceProperties* pExternalFenceProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- switch (pExternalFenceInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT:
- if (device->has_syncobj_wait) {
- pExternalFenceProperties->exportFromImportedHandleTypes =
- VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalFenceProperties->compatibleHandleTypes =
- VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT |
- VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalFenceProperties->externalFenceFeatures =
- VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT;
- return;
- }
- break;
-
- default:
- break;
- }
-
- pExternalFenceProperties->exportFromImportedHandleTypes = 0;
- pExternalFenceProperties->compatibleHandleTypes = 0;
- pExternalFenceProperties->externalFenceFeatures = 0;
-}
-
-VkResult anv_ImportFenceFdKHR(
- VkDevice _device,
- const VkImportFenceFdInfoKHR* pImportFenceFdInfo)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, pImportFenceFdInfo->fence);
- int fd = pImportFenceFdInfo->fd;
-
- assert(pImportFenceFdInfo->sType ==
- VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR);
-
- struct anv_fence_impl new_impl = {
- .type = ANV_FENCE_TYPE_NONE,
- };
-
- switch (pImportFenceFdInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT:
- new_impl.type = ANV_FENCE_TYPE_SYNCOBJ;
-
- new_impl.syncobj = anv_gem_syncobj_fd_to_handle(device, fd);
- if (!new_impl.syncobj)
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
-
- break;
-
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
- /* Sync files are a bit tricky. Because we want to continue using the
- * syncobj implementation of WaitForFences, we don't use the sync file
- * directly but instead import it into a syncobj.
- */
- new_impl.type = ANV_FENCE_TYPE_SYNCOBJ;
-
- /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the
- * special value -1 for fd is treated like a valid sync file descriptor
- * referring to an object that has already signaled. The import
- * operation will succeed and the VkFence will have a temporarily
- * imported payload as if a valid file descriptor had been provided."
- */
- uint32_t create_flags = 0;
- if (fd == -1)
- create_flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
-
- new_impl.syncobj = anv_gem_syncobj_create(device, create_flags);
- if (!new_impl.syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (fd != -1 &&
- anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) {
- anv_gem_syncobj_destroy(device, new_impl.syncobj);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "syncobj sync file import failed: %m");
- }
- break;
- }
-
- default:
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "Importing a fence payload from a file descriptor transfers
- * ownership of the file descriptor from the application to the
- * Vulkan implementation. The application must not perform any
- * operations on the file descriptor after a successful import."
- *
- * If the import fails, we leave the file descriptor open.
- */
- if (fd != -1)
- close(fd);
-
- if (pImportFenceFdInfo->flags & VK_FENCE_IMPORT_TEMPORARY_BIT) {
- anv_fence_impl_cleanup(device, &fence->temporary);
- fence->temporary = new_impl;
- } else {
- anv_fence_impl_cleanup(device, &fence->permanent);
- fence->permanent = new_impl;
- }
-
- return VK_SUCCESS;
-}
-
-/* The sideband payload of the DRM syncobj was incremented when the
- * application called vkQueueSubmit(). Here we wait for a fence with the same
- * value to materialize so that we can export it (typically as a SyncFD).
- */
-static VkResult
-wait_syncobj_materialize(struct anv_device *device,
- uint32_t syncobj,
- int *fd)
-{
- if (!device->has_thread_submit)
- return VK_SUCCESS;
-
- uint64_t binary_value = 0;
- /* We might need to wait until the fence materializes before we can
- * export to a sync FD when we use a thread for submission.
- */
- if (anv_gem_syncobj_timeline_wait(device, &syncobj, &binary_value, 1,
- anv_get_absolute_timeout(5ull * NSEC_PER_SEC),
- true /* wait_all */,
- true /* wait_materialize */))
- return anv_device_set_lost(device, "anv_gem_syncobj_timeline_wait failed: %m");
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetFenceFdKHR(
- VkDevice _device,
- const VkFenceGetFdInfoKHR* pGetFdInfo,
- int* pFd)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, pGetFdInfo->fence);
-
- assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR);
-
- struct anv_fence_impl *impl =
- fence->temporary.type != ANV_FENCE_TYPE_NONE ?
- &fence->temporary : &fence->permanent;
-
- assert(impl->type == ANV_FENCE_TYPE_SYNCOBJ);
- switch (pGetFdInfo->handleType) {
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT: {
- int fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj);
- if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
-
- *pFd = fd;
- break;
- }
-
- case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
- VkResult result = wait_syncobj_materialize(device, impl->syncobj, pFd);
- if (result != VK_SUCCESS)
- return result;
-
- int fd = anv_gem_syncobj_export_sync_file(device, impl->syncobj);
- if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
-
- *pFd = fd;
- break;
- }
-
- default:
- unreachable("Invalid fence export handle type");
- }
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "Export operations have the same transference as the specified handle
- * type’s import operations. [...] If the fence was using a
- * temporarily imported payload, the fence’s prior permanent payload
- * will be restored."
- */
- if (impl == &fence->temporary)
- anv_fence_impl_cleanup(device, impl);
-
- return VK_SUCCESS;
-}
-
-// Queue semaphore functions
-
-static VkSemaphoreTypeKHR
-get_semaphore_type(const void *pNext, uint64_t *initial_value)
-{
- const VkSemaphoreTypeCreateInfoKHR *type_info =
- vk_find_struct_const(pNext, SEMAPHORE_TYPE_CREATE_INFO_KHR);
-
- if (!type_info)
- return VK_SEMAPHORE_TYPE_BINARY_KHR;
-
- if (initial_value)
- *initial_value = type_info->initialValue;
- return type_info->semaphoreType;
-}
-
-static VkResult
-binary_semaphore_create(struct anv_device *device,
- struct anv_semaphore_impl *impl,
- bool exportable)
-{
- impl->type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
- impl->syncobj = anv_gem_syncobj_create(device, 0);
- if (!impl->syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- return VK_SUCCESS;
-}
-
-static VkResult
-timeline_semaphore_create(struct anv_device *device,
- struct anv_semaphore_impl *impl,
- uint64_t initial_value)
-{
- if (device->has_thread_submit) {
- impl->type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE;
- impl->syncobj = anv_gem_syncobj_create(device, 0);
- if (!impl->syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- if (initial_value) {
- if (anv_gem_syncobj_timeline_signal(device,
- &impl->syncobj,
- &initial_value, 1)) {
- anv_gem_syncobj_destroy(device, impl->syncobj);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- }
- } else {
- impl->type = ANV_SEMAPHORE_TYPE_TIMELINE;
- anv_timeline_init(device, &impl->timeline, initial_value);
- }
-
- return VK_SUCCESS;
-}
-
-VkResult anv_CreateSemaphore(
- VkDevice _device,
- const VkSemaphoreCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSemaphore* pSemaphore)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_semaphore *semaphore;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);
-
- uint64_t timeline_value = 0;
- VkSemaphoreTypeKHR sem_type = get_semaphore_type(pCreateInfo->pNext, &timeline_value);
-
- semaphore = vk_object_alloc(&device->vk, NULL, sizeof(*semaphore),
- VK_OBJECT_TYPE_SEMAPHORE);
- if (semaphore == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- const VkExportSemaphoreCreateInfo *export =
- vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO);
- VkExternalSemaphoreHandleTypeFlags handleTypes =
- export ? export->handleTypes : 0;
- VkResult result;
-
- if (handleTypes == 0) {
- if (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR)
- result = binary_semaphore_create(device, &semaphore->permanent, false);
- else
- result = timeline_semaphore_create(device, &semaphore->permanent, timeline_value);
- if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, semaphore);
- return result;
- }
- } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
- assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT);
- if (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR)
- result = binary_semaphore_create(device, &semaphore->permanent, true);
- else
- result = timeline_semaphore_create(device, &semaphore->permanent, timeline_value);
+ if (queue_family->engine_class == INTEL_ENGINE_CLASS_COPY ||
+ queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ result = vk_sync_create(&device->vk,
+ &device->physical->sync_syncobj_type,
+ 0, 0, &queue->companion_sync);
if (result != VK_SUCCESS) {
- vk_object_free(&device->vk, pAllocator, semaphore);
+ anv_queue_finish(queue);
return result;
}
- } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) {
- assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT);
- assert(sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR);
- semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
- semaphore->permanent.syncobj = anv_gem_syncobj_create(device, 0);
- if (!semaphore->permanent.syncobj) {
- vk_object_free(&device->vk, pAllocator, semaphore);
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- } else {
- assert(!"Unknown handle type");
- vk_object_free(&device->vk, pAllocator, semaphore);
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
}
- semaphore->temporary.type = ANV_SEMAPHORE_TYPE_NONE;
-
- *pSemaphore = anv_semaphore_to_handle(semaphore);
-
return VK_SUCCESS;
}
-static void
-anv_semaphore_impl_cleanup(struct anv_device *device,
- struct anv_semaphore_impl *impl)
-{
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_NONE:
- case ANV_SEMAPHORE_TYPE_DUMMY:
- /* Dummy. Nothing to do */
- break;
-
- case ANV_SEMAPHORE_TYPE_WSI_BO:
- anv_device_release_bo(device, impl->bo);
- break;
-
- case ANV_SEMAPHORE_TYPE_TIMELINE:
- anv_timeline_finish(device, &impl->timeline);
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE:
- anv_gem_syncobj_destroy(device, impl->syncobj);
- break;
-
- default:
- unreachable("Invalid semaphore type");
- }
-
- impl->type = ANV_SEMAPHORE_TYPE_NONE;
-}
-
void
-anv_semaphore_reset_temporary(struct anv_device *device,
- struct anv_semaphore *semaphore)
-{
- if (semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE)
- return;
-
- anv_semaphore_impl_cleanup(device, &semaphore->temporary);
-}
-
-void anv_DestroySemaphore(
- VkDevice _device,
- VkSemaphore _semaphore,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);
-
- if (semaphore == NULL)
- return;
-
- anv_semaphore_impl_cleanup(device, &semaphore->temporary);
- anv_semaphore_impl_cleanup(device, &semaphore->permanent);
-
- vk_object_base_finish(&semaphore->base);
- vk_free(&device->vk.alloc, semaphore);
-}
-
-void anv_GetPhysicalDeviceExternalSemaphoreProperties(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceExternalSemaphoreInfo* pExternalSemaphoreInfo,
- VkExternalSemaphoreProperties* pExternalSemaphoreProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- VkSemaphoreTypeKHR sem_type =
- get_semaphore_type(pExternalSemaphoreInfo->pNext, NULL);
-
- switch (pExternalSemaphoreInfo->handleType) {
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
- /* Timeline semaphores are not exportable, unless we have threaded
- * submission.
- */
- if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR && !device->has_thread_submit)
- break;
- pExternalSemaphoreProperties->exportFromImportedHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
- pExternalSemaphoreProperties->compatibleHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
- pExternalSemaphoreProperties->externalSemaphoreFeatures =
- VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT;
- return;
-
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
- if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR)
- break;
- if (!device->has_exec_fence)
- break;
- pExternalSemaphoreProperties->exportFromImportedHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalSemaphoreProperties->compatibleHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalSemaphoreProperties->externalSemaphoreFeatures =
- VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT;
- return;
-
- default:
- break;
- }
-
- pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
- pExternalSemaphoreProperties->compatibleHandleTypes = 0;
- pExternalSemaphoreProperties->externalSemaphoreFeatures = 0;
-}
-
-VkResult anv_ImportSemaphoreFdKHR(
- VkDevice _device,
- const VkImportSemaphoreFdInfoKHR* pImportSemaphoreFdInfo)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pImportSemaphoreFdInfo->semaphore);
- int fd = pImportSemaphoreFdInfo->fd;
-
- struct anv_semaphore_impl new_impl = {
- .type = ANV_SEMAPHORE_TYPE_NONE,
- };
-
- switch (pImportSemaphoreFdInfo->handleType) {
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
- /* When importing non-temporarily, reuse the semaphore's existing
- * type. The Linux/DRM implementation allows binary & timeline semaphores
- * to be used interchangeably and we have no way to differentiate them.
- */
- if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT)
- new_impl.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
- else
- new_impl.type = semaphore->permanent.type;
-
- new_impl.syncobj = anv_gem_syncobj_fd_to_handle(device, fd);
- if (!new_impl.syncobj)
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
-
- /* From the Vulkan spec:
- *
- * "Importing semaphore state from a file descriptor transfers
- * ownership of the file descriptor from the application to the
- * Vulkan implementation. The application must not perform any
- * operations on the file descriptor after a successful import."
- *
- * If the import fails, we leave the file descriptor open.
- */
- close(fd);
- break;
-
- case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT: {
- uint32_t create_flags = 0;
-
- if (fd == -1)
- create_flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
-
- new_impl = (struct anv_semaphore_impl) {
- .type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ,
- .syncobj = anv_gem_syncobj_create(device, create_flags),
- };
-
- if (!new_impl.syncobj)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- if (fd != -1) {
- if (anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) {
- anv_gem_syncobj_destroy(device, new_impl.syncobj);
- return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
- "syncobj sync file import failed: %m");
- }
- /* Ownership of the FD is transferred to Anv. Since we don't need it
- * anymore because the associated fence has been put into a syncobj,
- * we must close the FD.
- */
- close(fd);
- }
- break;
- }
-
- default:
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
- if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT) {
- anv_semaphore_impl_cleanup(device, &semaphore->temporary);
- semaphore->temporary = new_impl;
- } else {
- anv_semaphore_impl_cleanup(device, &semaphore->permanent);
- semaphore->permanent = new_impl;
- }
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetSemaphoreFdKHR(
- VkDevice _device,
- const VkSemaphoreGetFdInfoKHR* pGetFdInfo,
- int* pFd)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pGetFdInfo->semaphore);
- int fd;
-
- assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR);
-
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
- if (pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) {
- VkResult result = wait_syncobj_materialize(device, impl->syncobj, pFd);
- if (result != VK_SUCCESS)
- return result;
-
- fd = anv_gem_syncobj_export_sync_file(device, impl->syncobj);
- } else {
- assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT);
- fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj);
- }
- if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
- *pFd = fd;
- break;
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE:
- assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT);
- fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj);
- if (fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
- *pFd = fd;
- break;
-
- default:
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
- }
-
- /* From the Vulkan 1.0.53 spec:
- *
- * "Export operations have the same transference as the specified handle
- * type’s import operations. [...] If the semaphore was using a
- * temporarily imported payload, the semaphore’s prior permanent payload
- * will be restored."
- */
- if (impl == &semaphore->temporary)
- anv_semaphore_impl_cleanup(device, impl);
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetSemaphoreCounterValue(
- VkDevice _device,
- VkSemaphore _semaphore,
- uint64_t* pValue)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);
-
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_TIMELINE: {
- pthread_mutex_lock(&device->mutex);
- anv_timeline_gc_locked(device, &impl->timeline);
- *pValue = impl->timeline.highest_past;
- pthread_mutex_unlock(&device->mutex);
- return VK_SUCCESS;
- }
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: {
- int ret = anv_gem_syncobj_timeline_query(device, &impl->syncobj, pValue, 1);
-
- if (ret != 0)
- return anv_device_set_lost(device, "unable to query timeline syncobj");
-
- return VK_SUCCESS;
- }
-
- default:
- unreachable("Invalid semaphore type");
- }
-}
-
-static VkResult
-anv_timeline_wait_locked(struct anv_device *device,
- struct anv_timeline *timeline,
- uint64_t serial, uint64_t abs_timeout_ns)
-{
- /* Wait on the queue_submit condition variable until the timeline has a
- * time point pending that's at least as high as serial.
- */
- while (timeline->highest_pending < serial) {
- struct timespec abstime = {
- .tv_sec = abs_timeout_ns / NSEC_PER_SEC,
- .tv_nsec = abs_timeout_ns % NSEC_PER_SEC,
- };
-
- UNUSED int ret = pthread_cond_timedwait(&device->queue_submit,
- &device->mutex, &abstime);
- assert(ret != EINVAL);
- if (anv_gettime_ns() >= abs_timeout_ns &&
- timeline->highest_pending < serial)
- return VK_TIMEOUT;
- }
-
- while (1) {
- VkResult result = anv_timeline_gc_locked(device, timeline);
- if (result != VK_SUCCESS)
- return result;
-
- if (timeline->highest_past >= serial)
- return VK_SUCCESS;
-
- /* If we got here, our earliest time point has a busy BO */
- struct anv_timeline_point *point =
- list_first_entry(&timeline->points,
- struct anv_timeline_point, link);
-
- /* Drop the lock while we wait. */
- point->waiting++;
- pthread_mutex_unlock(&device->mutex);
-
- result = anv_device_wait(device, point->bo,
- anv_get_relative_timeout(abs_timeout_ns));
-
- /* Pick the mutex back up */
- pthread_mutex_lock(&device->mutex);
- point->waiting--;
-
- /* This covers both VK_TIMEOUT and VK_ERROR_DEVICE_LOST */
- if (result != VK_SUCCESS)
- return result;
- }
-}
-
-static VkResult
-anv_timelines_wait(struct anv_device *device,
- struct anv_timeline **timelines,
- const uint64_t *serials,
- uint32_t n_timelines,
- bool wait_all,
- uint64_t abs_timeout_ns)
-{
- if (!wait_all && n_timelines > 1) {
- pthread_mutex_lock(&device->mutex);
-
- while (1) {
- VkResult result;
- for (uint32_t i = 0; i < n_timelines; i++) {
- result =
- anv_timeline_wait_locked(device, timelines[i], serials[i], 0);
- if (result != VK_TIMEOUT)
- break;
- }
-
- if (result != VK_TIMEOUT ||
- anv_gettime_ns() >= abs_timeout_ns) {
- pthread_mutex_unlock(&device->mutex);
- return result;
- }
-
- /* If none of them are ready do a short wait so we don't completely
- * spin while holding the lock. The 10us is completely arbitrary.
- */
- uint64_t abs_short_wait_ns =
- anv_get_absolute_timeout(
- MIN2((anv_gettime_ns() - abs_timeout_ns) / 10, 10 * 1000));
- struct timespec abstime = {
- .tv_sec = abs_short_wait_ns / NSEC_PER_SEC,
- .tv_nsec = abs_short_wait_ns % NSEC_PER_SEC,
- };
- ASSERTED int ret;
- ret = pthread_cond_timedwait(&device->queue_submit,
- &device->mutex, &abstime);
- assert(ret != EINVAL);
- }
- } else {
- VkResult result = VK_SUCCESS;
- pthread_mutex_lock(&device->mutex);
- for (uint32_t i = 0; i < n_timelines; i++) {
- result =
- anv_timeline_wait_locked(device, timelines[i],
- serials[i], abs_timeout_ns);
- if (result != VK_SUCCESS)
- break;
- }
- pthread_mutex_unlock(&device->mutex);
- return result;
- }
-}
-
-VkResult anv_WaitSemaphores(
- VkDevice _device,
- const VkSemaphoreWaitInfoKHR* pWaitInfo,
- uint64_t timeout)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- uint32_t *handles;
- struct anv_timeline **timelines;
-
- VK_MULTIALLOC(ma);
-
- VK_MULTIALLOC_DECL(&ma, uint64_t, values, pWaitInfo->semaphoreCount);
- if (device->has_thread_submit) {
- vk_multialloc_add(&ma, &handles, uint32_t, pWaitInfo->semaphoreCount);
- } else {
- vk_multialloc_add(&ma, &timelines, struct anv_timeline *,
- pWaitInfo->semaphoreCount);
- }
-
- if (!vk_multialloc_alloc(&ma, &device->vk.alloc,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- uint32_t handle_count = 0;
- for (uint32_t i = 0; i < pWaitInfo->semaphoreCount; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pWaitInfo->pSemaphores[i]);
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- if (pWaitInfo->pValues[i] == 0)
- continue;
-
- if (device->has_thread_submit) {
- assert(impl->type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE);
- handles[handle_count] = impl->syncobj;
- } else {
- assert(impl->type == ANV_SEMAPHORE_TYPE_TIMELINE);
- timelines[handle_count] = &impl->timeline;
- }
- values[handle_count] = pWaitInfo->pValues[i];
- handle_count++;
- }
-
- VkResult result = VK_SUCCESS;
- if (handle_count > 0) {
- if (device->has_thread_submit) {
- int ret =
- anv_gem_syncobj_timeline_wait(device,
- handles, values, handle_count,
- anv_get_absolute_timeout(timeout),
- !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR),
- false);
- if (ret != 0)
- result = errno == ETIME ? VK_TIMEOUT :
- anv_device_set_lost(device, "unable to wait on timeline syncobj");
- } else {
- result =
- anv_timelines_wait(device, timelines, values, handle_count,
- !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR),
- anv_get_absolute_timeout(timeout));
- }
- }
-
- vk_free(&device->vk.alloc, values);
-
- return result;
-}
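/* Minimal sketch of the VK_MULTIALLOC pattern used in anv_WaitSemaphores()
 * above: one heap allocation backs several arrays and a single vk_free() of
 * the first pointer releases everything. The example_ names are hypothetical.
 */
static VkResult
example_multialloc(struct anv_device *device, uint32_t count,
                   uint64_t **out_values, uint32_t **out_handles)
{
   VK_MULTIALLOC(ma);
   VK_MULTIALLOC_DECL(&ma, uint64_t, values, count);
   VK_MULTIALLOC_DECL(&ma, uint32_t, handles, count);

   if (!vk_multialloc_alloc(&ma, &device->vk.alloc,
                            VK_SYSTEM_ALLOCATION_SCOPE_COMMAND))
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   *out_values = values;
   *out_handles = handles;
   /* A single vk_free(&device->vk.alloc, values) later releases both arrays. */
   return VK_SUCCESS;
}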
-
-VkResult anv_SignalSemaphore(
- VkDevice _device,
- const VkSemaphoreSignalInfoKHR* pSignalInfo)
+anv_queue_finish(struct anv_queue *queue)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pSignalInfo->semaphore);
-
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- switch (impl->type) {
- case ANV_SEMAPHORE_TYPE_TIMELINE: {
- pthread_mutex_lock(&device->mutex);
-
- VkResult result = anv_timeline_gc_locked(device, &impl->timeline);
+ if (queue->sync)
+ vk_sync_destroy(&queue->device->vk, queue->sync);
- assert(pSignalInfo->value > impl->timeline.highest_pending);
+ if (queue->companion_sync)
+ vk_sync_destroy(&queue->device->vk, queue->companion_sync);
- impl->timeline.highest_pending = impl->timeline.highest_past = pSignalInfo->value;
-
- if (result == VK_SUCCESS)
- result = anv_device_submit_deferred_locked(device);
-
- pthread_cond_broadcast(&device->queue_submit);
- pthread_mutex_unlock(&device->mutex);
- return result;
- }
-
- case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: {
- /* Timeline semaphores are created with a value of 0, so signaling on 0
- * is a waste of time.
- */
- if (pSignalInfo->value == 0)
- return VK_SUCCESS;
-
- int ret = anv_gem_syncobj_timeline_signal(device, &impl->syncobj,
- &pSignalInfo->value, 1);
-
- return ret == 0 ? VK_SUCCESS :
- anv_device_set_lost(device, "unable to signal timeline syncobj");
- }
-
- default:
- unreachable("Invalid semaphore type");
- }
+ anv_destroy_engine(queue);
+ vk_queue_finish(&queue->vk);
}
diff --git a/src/intel/vulkan/anv_rmv.c b/src/intel/vulkan/anv_rmv.c
new file mode 100644
index 00000000000..a65258d8d1f
--- /dev/null
+++ b/src/intel/vulkan/anv_rmv.c
@@ -0,0 +1,864 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "anv_private.h"
+
+static VkResult
+capture_trace(VkQueue _queue)
+{
+ ANV_FROM_HANDLE(anv_queue, queue, _queue);
+
+ simple_mtx_lock(&queue->device->vk.memory_trace_data.token_mtx);
+ vk_dump_rmv_capture(&queue->device->vk.memory_trace_data);
+ simple_mtx_unlock(&queue->device->vk.memory_trace_data.token_mtx);
+
+ return VK_SUCCESS;
+}
+
+void
+anv_memory_trace_init(struct anv_device *device)
+{
+ struct vk_rmv_device_info info;
+ memset(&info, 0, sizeof(info));
+ anv_rmv_fill_device_info(device->physical, &info);
+ vk_memory_trace_init(&device->vk, &info);
+
+ if (!device->vk.memory_trace_data.is_enabled)
+ return;
+
+ device->vk.capture_trace = capture_trace;
+}
+
+static void
+fill_memory_info(const struct anv_physical_device *device,
+ struct vk_rmv_memory_info *out_info,
+ int32_t index)
+{
+ switch (index) {
+ case VK_RMV_MEMORY_LOCATION_DEVICE:
+ out_info->physical_base_address = 0;
+ out_info->size = device->memory.heaps[0].size;
+ break;
+ case VK_RMV_MEMORY_LOCATION_DEVICE_INVISIBLE:
+ out_info->physical_base_address = device->memory.heaps[0].size;
+ out_info->size = device->vram_non_mappable.size;
+ break;
+ case VK_RMV_MEMORY_LOCATION_HOST:
+ out_info->physical_base_address = 0;
+ out_info->size = device->memory.heaps[1].size;
+ break;
+ default:
+ unreachable("invalid memory index");
+ }
+}
+
+void
+anv_rmv_fill_device_info(const struct anv_physical_device *device,
+ struct vk_rmv_device_info *info)
+{
+ for (int32_t i = 0; i < VK_RMV_MEMORY_LOCATION_COUNT; ++i)
+ fill_memory_info(device, &info->memory_infos[i], i);
+
+ strncpy(info->device_name, device->info.name, sizeof(info->device_name) - 1);
+ info->pcie_revision_id = device->info.pci_revision_id;
+ info->pcie_device_id = device->info.pci_device_id;
+ /* TODO: */
+ info->pcie_family_id = 0;
+ info->minimum_shader_clock = 0;
+ info->maximum_shader_clock = 1 * 1024 * 1024 * 1024;
+ info->vram_type = VK_RMV_MEMORY_TYPE_DDR4;
+ info->vram_bus_width = 256;
+ info->vram_operations_per_clock = 1;
+ info->minimum_memory_clock = 0;
+ info->maximum_memory_clock = 1;
+ info->vram_bandwidth = 256;
+}
+
+void
+anv_memory_trace_finish(struct anv_device *device)
+{
+}
+
+static uint32_t
+resource_id_locked(struct anv_device *device, const void *obj)
+{
+ return vk_rmv_get_resource_id_locked(&device->vk, (uint64_t)(uintptr_t)obj);
+}
+
+static void
+resource_destroy_locked(struct anv_device *device, const void *obj)
+{
+ vk_rmv_destroy_resource_id_locked(&device->vk, (uint64_t)(uintptr_t)obj);
+}
+
+/* The token lock must be held when entering _locked functions */
+static void
+log_resource_bind_locked(struct anv_device *device, uint64_t resource_id,
+ struct anv_bo *bo, uint64_t offset,
+ uint64_t size)
+{
+ struct vk_rmv_resource_bind_token token = {
+ .resource_id = resource_id,
+ .is_system_memory = bo ? (bo->alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) : 0,
+ .address = (bo ? bo->offset : 0) + offset,
+ .size = size,
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &token);
+}
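/* Usage sketch for the "_locked" convention noted above: callers take the
 * token mutex and then call the _locked helper, as the public entry points
 * later in this file do. The wrapper below is hypothetical, for illustration.
 */
static void
example_log_bind(struct anv_device *device, uint64_t resource_id,
                 struct anv_bo *bo, uint64_t offset, uint64_t size)
{
   simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
   log_resource_bind_locked(device, resource_id, bo, offset, size);
   simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
}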
+
+static void
+log_state_pool_bind_locked(struct anv_device *device, uint64_t resource_id,
+ struct anv_state_pool *pool, struct anv_state *state)
+{
+ struct vk_rmv_resource_bind_token token = {
+ .resource_id = resource_id,
+ .is_system_memory = (pool->block_pool.bo_alloc_flags &
+ ANV_BO_ALLOC_NO_LOCAL_MEM) != 0,
+ .address = anv_address_physical(
+ anv_state_pool_state_address(pool, *state)),
+ .size = state->alloc_size,
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &token);
+}
+
+static enum vk_rmv_memory_location
+anv_heap_index_to_memory_location(struct anv_device *device,
+ unsigned heap_index)
+{
+ if (heap_index == 0)
+ return device->physical->vram_non_mappable.size != 0 ?
+ VK_RMV_MEMORY_LOCATION_DEVICE_INVISIBLE :
+ VK_RMV_MEMORY_LOCATION_DEVICE;
+ else if (heap_index == 1)
+ return VK_RMV_MEMORY_LOCATION_HOST;
+ else
+ return VK_RMV_MEMORY_LOCATION_DEVICE;
+}
+
+static void
+anv_rmv_log_bo_gtt_unmap_locked(struct anv_device *device,
+ struct anv_bo *bo)
+{
+ if (!bo->gtt_mapped)
+ return;
+
+ struct vk_rmv_token token = {
+ .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
+ .timestamp = (uint64_t)os_time_get_nano(),
+ .data = {
+ .page_table_update = {
+ .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
+ .page_size = device->info->mem_alignment,
+ .page_count = DIV_ROUND_UP(bo->size,
+ device->info->mem_alignment),
+ .pid = getpid(),
+ .virtual_address = bo->offset,
+ .physical_address = bo->offset,
+ .is_unmap = true,
+ },
+ },
+ };
+ util_dynarray_append(&device->vk.memory_trace_data.tokens,
+ struct vk_rmv_token, token);
+
+ bo->gtt_mapped = false;
+}
+
+void
+anv_rmv_log_bo_gtt_unmap(struct anv_device *device,
+ struct anv_bo *bo)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ anv_rmv_log_bo_gtt_unmap_locked(device, bo);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_bo_gtt_map(struct anv_device *device,
+ struct anv_bo *bo)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_token token = {
+ .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
+ .timestamp = (uint64_t)os_time_get_nano(),
+ .data = {
+ .page_table_update = {
+ .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
+ .page_size = device->info->mem_alignment,
+ .page_count = DIV_ROUND_UP(bo->size,
+ device->info->mem_alignment),
+ .pid = getpid(),
+ .virtual_address = bo->offset,
+ .physical_address = bo->offset,
+ .is_unmap = false,
+ },
+ },
+ };
+ util_dynarray_append(&device->vk.memory_trace_data.tokens,
+ struct vk_rmv_token, token);
+
+ bo->gtt_mapped = true;
+
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_bos_gtt_map(struct anv_device *device,
+ struct anv_bo **bos,
+ uint32_t bo_count)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ for (uint32_t i = 0; i < bo_count; i++) {
+ struct anv_bo *bo = bos[i];
+
+ if (bo->gtt_mapped)
+ continue;
+
+ struct vk_rmv_token token = {
+ .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
+ .timestamp = (uint64_t)os_time_get_nano(),
+ .data = {
+ .page_table_update = {
+ .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
+ .page_size = device->info->mem_alignment,
+ .page_count = DIV_ROUND_UP(bo->size,
+ device->info->mem_alignment),
+ .pid = getpid(),
+ .virtual_address = bo->offset,
+ .physical_address = bo->offset,
+ .is_unmap = false,
+ },
+ },
+ };
+ util_dynarray_append(&device->vk.memory_trace_data.tokens,
+ struct vk_rmv_token, token);
+
+ bo->gtt_mapped = true;
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_vm_binds(struct anv_device *device,
+ struct anv_vm_bind *binds,
+ uint32_t bind_count)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ for (uint32_t i = 0; i < bind_count; i++) {
+
+ struct vk_rmv_token token = {
+ .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
+ .timestamp = (uint64_t)os_time_get_nano(),
+ .data = {
+ .page_table_update = {
+ .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
+ .page_size = device->info->mem_alignment,
+ .page_count = DIV_ROUND_UP(binds[i].size,
+ device->info->mem_alignment),
+ .pid = getpid(),
+ .virtual_address = binds[i].address,
+ .physical_address = binds[i].bo_offset,
+ .is_unmap = binds[i].op == ANV_VM_UNBIND,
+ },
+ },
+ };
+ util_dynarray_append(&device->vk.memory_trace_data.tokens,
+ struct vk_rmv_token, token);
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
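/* The PAGE_TABLE_UPDATE token built by the GTT-map/unmap and vm-bind loggers
 * above differs only in the address, size and is_unmap fields, so it could be
 * factored as below. Illustrative sketch only, not part of this change.
 */
static struct vk_rmv_token
example_page_table_update_token(struct anv_device *device,
                                uint64_t virtual_address,
                                uint64_t physical_address,
                                uint64_t size, bool is_unmap)
{
   return (struct vk_rmv_token) {
      .type = VK_RMV_TOKEN_TYPE_PAGE_TABLE_UPDATE,
      .timestamp = (uint64_t)os_time_get_nano(),
      .data = {
         .page_table_update = {
            .type = VK_RMV_PAGE_TABLE_UPDATE_TYPE_UPDATE,
            .page_size = device->info->mem_alignment,
            .page_count = DIV_ROUND_UP(size, device->info->mem_alignment),
            .pid = getpid(),
            .virtual_address = virtual_address,
            .physical_address = physical_address,
            .is_unmap = is_unmap,
         },
      },
   };
}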
+
+void
+anv_rmv_log_heap_create(struct anv_device *device,
+ struct anv_device_memory *memory,
+ bool is_internal,
+ VkMemoryAllocateFlags alloc_flags)
+{
+ /* Do not log zero-sized device memory objects. */
+ if (!memory->vk.size)
+ return;
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+
+ struct vk_rmv_resource_create_token token = {
+ .type = VK_RMV_RESOURCE_TYPE_HEAP,
+ .resource_id = resource_id_locked(device, memory),
+ .is_driver_internal = is_internal,
+ .heap = {
+ .alignment = device->info->mem_alignment,
+ .size = memory->vk.size,
+ .heap_index = anv_heap_index_to_memory_location(device,
+ memory->type->heapIndex),
+ .alloc_flags = alloc_flags,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &token);
+ log_resource_bind_locked(device, token.resource_id, memory->bo, 0, memory->vk.size);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+static void
+anv_rmv_log_vma_locked(struct anv_device *device, uint64_t address, uint64_t size,
+ bool internal, bool vram, bool in_invisible_vram)
+{
+ struct vk_rmv_virtual_allocate_token token = {
+ .address = address,
+ /* If all VRAM is visible, no bo will be in invisible memory. */
+ .is_in_invisible_vram = in_invisible_vram,
+ .preferred_domains = (vram ?
+ VK_RMV_KERNEL_MEMORY_DOMAIN_VRAM :
+ VK_RMV_KERNEL_MEMORY_DOMAIN_GTT),
+ .is_driver_internal = internal,
+ .page_count = DIV_ROUND_UP(size, 4096),
+ };
+
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_ALLOCATE, &token);
+}
+
+void
+anv_rmv_log_bo_allocate(struct anv_device *device,
+ struct anv_bo *bo)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ anv_rmv_log_vma_locked(device, bo->offset, bo->size,
+ bo->alloc_flags & ANV_BO_ALLOC_INTERNAL,
+ (bo->alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) == 0,
+ device->physical->vram_non_mappable.size != 0 &&
+ (bo->alloc_flags & (ANV_BO_ALLOC_MAPPED |
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT |
+ ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE |
+ ANV_BO_ALLOC_NO_LOCAL_MEM)) == 0);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+
+ if (bo->alloc_flags & ANV_BO_ALLOC_MAPPED)
+ vk_rmv_log_cpu_map(&device->vk, bo->offset, false);
+}
+
+void
+anv_rmv_log_bo_destroy(struct anv_device *device, struct anv_bo *bo)
+{
+ struct vk_rmv_virtual_free_token token = {
+ .address = bo->offset,
+ };
+
+ if (bo->alloc_flags & ANV_BO_ALLOC_MAPPED)
+ vk_rmv_log_cpu_map(&device->vk, bo->offset, true);
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ anv_rmv_log_bo_gtt_unmap_locked(device, bo);
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_FREE, &token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_buffer_create(struct anv_device *device,
+ bool is_internal,
+ struct anv_buffer *buffer)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token token = {
+ .type = VK_RMV_RESOURCE_TYPE_BUFFER,
+ .is_driver_internal = is_internal,
+ .resource_id = resource_id_locked(device, buffer),
+ .buffer = {
+ .create_flags = buffer->vk.create_flags,
+ .size = buffer->vk.size,
+ .usage_flags = buffer->vk.usage,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &token);
+ if (buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) {
+ assert(buffer->sparse_data.size != 0);
+ anv_rmv_log_vma_locked(device,
+ buffer->sparse_data.address,
+ buffer->sparse_data.size,
+ false /* internal */, true /* TODO: vram */,
+ true /* in_invisible_vram */);
+ log_resource_bind_locked(device,
+ resource_id_locked(device, buffer),
+ NULL,
+ buffer->sparse_data.address,
+ buffer->sparse_data.size);
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_buffer_destroy(struct anv_device *device,
+ struct anv_buffer *buffer)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ if (buffer->vk.create_flags & VK_BUFFER_CREATE_SPARSE_BINDING_BIT) {
+ struct vk_rmv_virtual_free_token token = {
+ .address = buffer->sparse_data.address,
+ };
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_FREE, &token);
+ }
+ resource_destroy_locked(device, buffer);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_buffer_bind(struct anv_device *device, struct anv_buffer *buffer)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ log_resource_bind_locked(device,
+ resource_id_locked(device, buffer),
+ buffer->address.bo,
+ buffer->address.offset, buffer->vk.size);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_image_create(struct anv_device *device,
+ bool is_internal,
+ struct anv_image *image)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token token = {
+ .type = VK_RMV_RESOURCE_TYPE_IMAGE,
+ .resource_id = resource_id_locked(device, image),
+ .is_driver_internal = is_internal,
+ .image = {
+ .create_flags = image->vk.create_flags,
+ .usage_flags = image->vk.usage,
+ .type = image->vk.image_type,
+ .extent = image->vk.extent,
+ .format = image->vk.format,
+ .num_mips = image->vk.mip_levels,
+ .num_slices = image->vk.array_layers,
+ .tiling = image->vk.tiling,
+ .alignment_log2 = util_logbase2(
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN].memory_range.alignment),
+ .log2_samples = util_logbase2(image->vk.samples),
+ .metadata_alignment_log2 = util_logbase2(
+ image->planes[0].aux_surface.isl.alignment_B),
+ .image_alignment_log2 = util_logbase2(
+ image->planes[0].primary_surface.isl.alignment_B),
+ .size = image->planes[0].primary_surface.memory_range.size,
+ .metadata_size = image->planes[0].aux_surface.memory_range.size,
+ .metadata_header_size = 0,
+ .metadata_offset = image->planes[0].aux_surface.memory_range.offset,
+ .metadata_header_offset = image->planes[0].aux_surface.memory_range.offset,
+ .presentable = (image->planes[0].primary_surface.isl.usage &
+ ISL_SURF_USAGE_DISPLAY_BIT) != 0,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &token);
+ if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) {
+ for (uint32_t b = 0; b < ARRAY_SIZE(image->bindings); b++) {
+ if (image->bindings[b].sparse_data.size != 0) {
+ anv_rmv_log_vma_locked(device,
+ image->bindings[b].sparse_data.address,
+ image->bindings[b].sparse_data.size,
+ false /* internal */, true /* TODO: vram */,
+ true /* in_invisible_vram */);
+ log_resource_bind_locked(device,
+ resource_id_locked(device, image),
+ NULL,
+ image->bindings[b].sparse_data.address,
+ image->bindings[b].sparse_data.size);
+ }
+ }
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_image_destroy(struct anv_device *device,
+ struct anv_image *image)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ if (image->vk.create_flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) {
+ for (uint32_t b = 0; b < ARRAY_SIZE(image->bindings); b++) {
+ if (image->bindings[b].sparse_data.size != 0) {
+ struct vk_rmv_virtual_free_token token = {
+ .address = image->bindings[b].sparse_data.address,
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_VIRTUAL_FREE, &token);
+ }
+ }
+ }
+ resource_destroy_locked(device, image);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_image_bind(struct anv_device *device,
+ struct anv_image *image,
+ enum anv_image_memory_binding binding)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ log_resource_bind_locked(device,
+ resource_id_locked(device, image),
+ image->bindings[binding].address.bo,
+ image->bindings[binding].address.offset,
+ image->bindings[binding].memory_range.size);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_query_pool_create(struct anv_device *device,
+ struct anv_query_pool *pool,
+ bool is_internal)
+{
+ if (pool->vk.query_type != VK_QUERY_TYPE_OCCLUSION &&
+ pool->vk.query_type != VK_QUERY_TYPE_PIPELINE_STATISTICS &&
+ pool->vk.query_type != VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT)
+ return;
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_QUERY_HEAP,
+ .resource_id = resource_id_locked(device, pool),
+ .is_driver_internal = is_internal,
+ .query_pool = {
+ .type = pool->vk.query_type,
+ .has_cpu_access = true,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ log_resource_bind_locked(device, create_token.resource_id,
+ pool->bo, 0, pool->bo->size);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+static void
+bind_cmd_buffer_state_stream_locked(struct anv_device *device,
+ uint64_t resource_id,
+ struct anv_state_stream *stream)
+{
+ util_dynarray_foreach(&stream->all_blocks, struct anv_state, block)
+ log_state_pool_bind_locked(device, resource_id, stream->state_pool, block);
+}
+
+void
+anv_rmv_log_cmd_buffer_create(struct anv_device *device,
+ struct anv_cmd_buffer *cmd_buffer)
+{
+ uint64_t data_size =
+ cmd_buffer->surface_state_stream.total_size +
+ cmd_buffer->dynamic_state_stream.total_size +
+ cmd_buffer->general_state_stream.total_size +
+ cmd_buffer->indirect_push_descriptor_stream.total_size;
+
+ uint64_t executable_size = 0;
+ list_for_each_entry(struct anv_batch_bo, bbo, &cmd_buffer->batch_bos, link)
+ executable_size += bbo->length;
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_COMMAND_ALLOCATOR,
+ .resource_id = resource_id_locked(device, cmd_buffer),
+ .is_driver_internal = true,
+ .command_buffer = {
+ .preferred_domain = VK_RMV_KERNEL_MEMORY_DOMAIN_GTT /* TODO */,
+ .executable_size = executable_size,
+ .app_available_executable_size = executable_size,
+ .embedded_data_size = data_size,
+ .app_available_embedded_data_size = data_size,
+ .scratch_size = 0,
+ .app_available_scratch_size = 0,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_CREATE,
+ &create_token);
+ list_for_each_entry(struct anv_batch_bo, bbo, &cmd_buffer->batch_bos, link) {
+ log_resource_bind_locked(device, create_token.resource_id,
+ bbo->bo, 0, bbo->length);
+ }
+ bind_cmd_buffer_state_stream_locked(device, create_token.resource_id,
+ &cmd_buffer->surface_state_stream);
+ bind_cmd_buffer_state_stream_locked(device, create_token.resource_id,
+ &cmd_buffer->dynamic_state_stream);
+ bind_cmd_buffer_state_stream_locked(device, create_token.resource_id,
+ &cmd_buffer->general_state_stream);
+ bind_cmd_buffer_state_stream_locked(device, create_token.resource_id,
+ &cmd_buffer->indirect_push_descriptor_stream);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_cmd_buffer_destroy(struct anv_device *device,
+ struct anv_cmd_buffer *cmd_buffer)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_destroy_token destroy_token = {
+ .resource_id = resource_id_locked(device, cmd_buffer),
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_DESTROY, &destroy_token);
+ resource_destroy_locked(device, cmd_buffer);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_sparse_add_residency(struct anv_device *device,
+ struct anv_bo *src_bo,
+ uint64_t offset)
+{
+ struct vk_rmv_resource_reference_token token = {
+ .virtual_address = src_bo->offset + offset,
+ .residency_removed = false,
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_REFERENCE, &token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_sparse_remove_residency(struct anv_device *device,
+ struct anv_bo *src_bo,
+ uint64_t offset)
+{
+ struct vk_rmv_resource_reference_token token = {
+ .virtual_address = src_bo->offset + offset,
+ .residency_removed = true,
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_REFERENCE, &token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_descriptor_pool_create(struct anv_device *device,
+ const VkDescriptorPoolCreateInfo *create_info,
+ struct anv_descriptor_pool *pool,
+ bool is_internal)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_DESCRIPTOR_POOL,
+ .resource_id = resource_id_locked(device, pool),
+ .is_driver_internal = false,
+ .descriptor_pool = {
+ .max_sets = create_info->maxSets,
+ .pool_size_count = create_info->poolSizeCount,
+ /* Using vk_rmv_token_pool_alloc frees the allocation automatically
+ * when the trace is done. */
+ .pool_sizes = malloc(create_info->poolSizeCount *
+ sizeof(VkDescriptorPoolSize)),
+ },
+ };
+
+ if (!create_token.descriptor_pool.pool_sizes) {
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+ return;
+ }
+
+ memcpy(create_token.descriptor_pool.pool_sizes, create_info->pPoolSizes,
+ create_info->poolSizeCount * sizeof(VkDescriptorPoolSize));
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data,
+ VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+
+ if (pool->surfaces.bo) {
+ struct vk_rmv_resource_bind_token bind_token = {
+ .resource_id = create_token.resource_id,
+ .is_system_memory = false,
+ .address = pool->surfaces.bo->offset,
+ .size = pool->surfaces.bo->size,
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &bind_token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+ }
+ if (pool->samplers.bo) {
+ struct vk_rmv_resource_bind_token bind_token = {
+ .resource_id = create_token.resource_id,
+ .is_system_memory = false,
+ .address = pool->samplers.bo->offset,
+ .size = pool->samplers.bo->size,
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_BIND, &bind_token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+ }
+}
+
+void
+anv_rmv_log_graphics_pipeline_create(struct anv_device *device,
+ struct anv_graphics_pipeline *pipeline,
+ bool is_internal)
+{
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_PIPELINE,
+ .resource_id = resource_id_locked(device, pipeline),
+ .is_driver_internal = is_internal,
+ .pipeline = {
+ .is_internal = is_internal,
+ .hash_lo = 0, /* TODO pipeline->pipeline_hash; */
+ .shader_stages = pipeline->base.base.active_stages,
+ },
+ };
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ for (unsigned s = 0; s < ARRAY_SIZE(pipeline->base.shaders); s++) {
+ struct anv_shader_bin *shader = pipeline->base.shaders[s];
+
+ if (!shader)
+ continue;
+
+ log_state_pool_bind_locked(device, create_token.resource_id,
+ &device->instruction_state_pool,
+ &shader->kernel);
+ }
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_compute_pipeline_create(struct anv_device *device,
+ struct anv_compute_pipeline *pipeline,
+ bool is_internal)
+{
+ VkShaderStageFlagBits active_stages =
+ pipeline->base.type == ANV_PIPELINE_COMPUTE ?
+ VK_SHADER_STAGE_COMPUTE_BIT : VK_SHADER_STAGE_RAYGEN_BIT_KHR;
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_PIPELINE,
+ .resource_id = resource_id_locked(device, pipeline),
+ .is_driver_internal = is_internal,
+ .pipeline = {
+ .is_internal = is_internal,
+ .hash_lo = 0, /* TODO pipeline->pipeline_hash; */
+ .shader_stages = active_stages,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ struct anv_shader_bin *shader = pipeline->cs;
+ log_state_pool_bind_locked(device, create_token.resource_id,
+ &device->instruction_state_pool,
+ &shader->kernel);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_rt_pipeline_create(struct anv_device *device,
+ struct anv_ray_tracing_pipeline *pipeline,
+ bool is_internal)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+
+ struct vk_rmv_resource_create_token create_token = {
+ .resource_id = resource_id_locked(device, pipeline),
+ .type = VK_RMV_RESOURCE_TYPE_PIPELINE,
+ .is_driver_internal = is_internal,
+ .pipeline = {
+ .is_internal = is_internal,
+ .hash_lo = 0, /* TODO */
+ .shader_stages = pipeline->base.active_stages,
+ },
+ };
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+
+ struct anv_state_pool *state_pool = &device->instruction_state_pool;
+ for (uint32_t i = 0; i < pipeline->group_count; i++) {
+ struct anv_rt_shader_group *group = &pipeline->groups[i];
+
+ if (group->imported)
+ continue;
+
+ if (group->general) {
+ log_state_pool_bind_locked(device, create_token.resource_id, state_pool,
+ &group->general->kernel);
+ }
+ if (group->closest_hit) {
+ log_state_pool_bind_locked(device, create_token.resource_id, state_pool,
+ &group->closest_hit->kernel);
+ }
+ if (group->any_hit) {
+ log_state_pool_bind_locked(device, create_token.resource_id, state_pool,
+ &group->any_hit->kernel);
+ }
+ if (group->intersection) {
+ log_state_pool_bind_locked(device, create_token.resource_id, state_pool,
+ &group->intersection->kernel);
+ }
+ }
+
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_event_create(struct anv_device *device,
+ struct anv_event *event,
+ VkEventCreateFlags flags,
+ bool is_internal)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_create_token create_token = {
+ .type = VK_RMV_RESOURCE_TYPE_GPU_EVENT,
+ .resource_id = resource_id_locked(device, event),
+ .is_driver_internal = is_internal,
+ .event = {
+ .flags = flags,
+ },
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_CREATE, &create_token);
+ log_state_pool_bind_locked(device, create_token.resource_id,
+ &device->dynamic_state_pool,
+ &event->state);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
+
+void
+anv_rmv_log_resource_destroy(struct anv_device *device, const void *obj)
+{
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_resource_destroy_token token = {
+ .resource_id = resource_id_locked(device, obj),
+ };
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_RESOURCE_DESTROY, &token);
+ resource_destroy_locked(device, obj);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+}
diff --git a/src/intel/vulkan/anv_rmv.h b/src/intel/vulkan/anv_rmv.h
new file mode 100644
index 00000000000..e5e94619863
--- /dev/null
+++ b/src/intel/vulkan/anv_rmv.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef ANV_RMV_H
+#define ANV_RMV_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "vulkan/vulkan_core.h"
+
+struct anv_device;
+struct anv_device_memory;
+struct anv_physical_device;
+struct anv_descriptor_pool;
+struct anv_buffer;
+struct anv_image;
+struct anv_bo;
+struct anv_event;
+struct anv_graphics_pipeline;
+struct anv_compute_pipeline;
+struct anv_ray_tracing_pipeline;
+struct anv_cmd_buffer;
+struct anv_query_pool;
+struct anv_vm_bind;
+struct vk_rmv_device_info;
+
+enum anv_image_memory_binding;
+
+#define ANV_RMV(func, device, ...) do { \
+ if (unlikely((device)->vk.memory_trace_data.is_enabled)) \
+ anv_rmv_log_##func(device, __VA_ARGS__); \
+ } while (0)
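+
+/* Illustrative use (a hypothetical call site, not part of this header): the
+ * macro expands to a call to the matching anv_rmv_log_* function only when
+ * memory tracing is enabled, e.g.
+ *
+ *    ANV_RMV(bo_allocate, device, bo);
+ *
+ * becomes anv_rmv_log_bo_allocate(device, bo).
+ */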
+
+void anv_memory_trace_init(struct anv_device *device);
+void anv_rmv_fill_device_info(const struct anv_physical_device *device,
+ struct vk_rmv_device_info *info);
+void anv_memory_trace_finish(struct anv_device *device);
+
+void anv_rmv_log_heap_create(struct anv_device *device,
+ struct anv_device_memory *memory,
+ bool is_internal,
+ VkMemoryAllocateFlags alloc_flags);
+void anv_rmv_log_bo_gtt_map(struct anv_device *device,
+ struct anv_bo *bo);
+void anv_rmv_log_bo_gtt_unmap(struct anv_device *device,
+ struct anv_bo *bo);
+void anv_rmv_log_bos_gtt_map(struct anv_device *device,
+ struct anv_bo **bos,
+ uint32_t bo_count);
+void anv_rmv_log_vm_binds(struct anv_device *device,
+ struct anv_vm_bind *binds,
+ uint32_t bind_count);
+void anv_rmv_log_bo_allocate(struct anv_device *device,
+ struct anv_bo *bo);
+void anv_rmv_log_bo_destroy(struct anv_device *device, struct anv_bo *bo);
+void anv_rmv_log_buffer_create(struct anv_device *device,
+ bool is_internal,
+ struct anv_buffer *buffer);
+void anv_rmv_log_buffer_destroy(struct anv_device *device,
+ struct anv_buffer *buffer);
+void anv_rmv_log_buffer_bind(struct anv_device *device, struct anv_buffer *buffer);
+void anv_rmv_log_image_create(struct anv_device *device,
+ bool is_internal,
+ struct anv_image *image);
+void anv_rmv_log_image_destroy(struct anv_device *device,
+ struct anv_image *image);
+void anv_rmv_log_image_bind(struct anv_device *device,
+ struct anv_image *image,
+ enum anv_image_memory_binding binding);
+void anv_rmv_log_query_pool_create(struct anv_device *device,
+ struct anv_query_pool *pool,
+ bool is_internal);
+void anv_rmv_log_cmd_buffer_create(struct anv_device *device,
+ struct anv_cmd_buffer *cmd_buffer);
+void anv_rmv_log_cmd_buffer_destroy(struct anv_device *device,
+ struct anv_cmd_buffer *cmd_buffer);
+void anv_rmv_log_sparse_add_residency(struct anv_device *device,
+ struct anv_bo *src_bo,
+ uint64_t offset);
+void anv_rmv_log_sparse_remove_residency(struct anv_device *device,
+ struct anv_bo *src_bo,
+ uint64_t offset);
+void anv_rmv_log_descriptor_pool_create(struct anv_device *device,
+ const VkDescriptorPoolCreateInfo *create_info,
+ struct anv_descriptor_pool *pool,
+ bool is_internal);
+void anv_rmv_log_graphics_pipeline_create(struct anv_device *device,
+ struct anv_graphics_pipeline *pipeline,
+ bool is_internal);
+void anv_rmv_log_compute_pipeline_create(struct anv_device *device,
+ struct anv_compute_pipeline *pipeline,
+ bool is_internal);
+void anv_rmv_log_rt_pipeline_create(struct anv_device *device,
+ struct anv_ray_tracing_pipeline *pipeline,
+ bool is_internal);
+void anv_rmv_log_event_create(struct anv_device *device,
+ struct anv_event *event,
+ VkEventCreateFlags flags, bool is_internal);
+void anv_rmv_log_resource_destroy(struct anv_device *device, const void *obj);
+
+#endif /* ANV_RMV_H */
diff --git a/src/intel/vulkan/anv_sparse.c b/src/intel/vulkan/anv_sparse.c
new file mode 100644
index 00000000000..279dffea510
--- /dev/null
+++ b/src/intel/vulkan/anv_sparse.c
@@ -0,0 +1,1293 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+/* Sparse binding handling.
+ *
+ * There is one main structure passed around all over this file:
+ *
+ * - struct anv_sparse_binding_data: every resource (VkBuffer or VkImage) has
+ * a pointer to an instance of this structure. It contains the virtual
+ * memory address (VMA) used by the binding operations (which is different
+ * from the VMA used by the anv_bo it's bound to) and the VMA range size. We
+ * do not keep a record of our list of bindings (i.e., which ranges were
+ * bound to which buffers).
+ */
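+
+/* A rough lifecycle sketch (assumed from the functions below, not a formal
+ * contract): anv_init_sparse_bindings() reserves the VMA and emits an initial
+ * NULL bind, the sparse-binding path builds an anv_sparse_submission through
+ * anv_sparse_bind_buffer() / anv_sparse_bind_image_opaque() /
+ * anv_sparse_bind_image_memory() and flushes it with anv_sparse_bind(), and
+ * anv_free_sparse_bindings() unbinds and releases the VMA when the resource
+ * is destroyed.
+ */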
+
+__attribute__((format(printf, 1, 2)))
+static void
+sparse_debug(const char *format, ...)
+{
+ if (!INTEL_DEBUG(DEBUG_SPARSE))
+ return;
+
+ va_list args;
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
+}
+
+static void
+dump_anv_vm_bind(struct anv_device *device,
+ const struct anv_vm_bind *bind)
+{
+ sparse_debug("[%s] ", bind->op == ANV_VM_BIND ? " bind " : "unbind");
+
+ if (bind->bo)
+ sparse_debug("bo:%04u ", bind->bo->gem_handle);
+ else
+ sparse_debug("bo:---- ");
+ sparse_debug("address:%016"PRIx64" size:%08"PRIx64" "
+ "mem_offset:%08"PRIx64"\n",
+ bind->address, bind->size, bind->bo_offset);
+}
+
+static void
+dump_anv_image(struct anv_image *i)
+{
+ if (!INTEL_DEBUG(DEBUG_SPARSE))
+ return;
+
+ sparse_debug("anv_image:\n");
+ sparse_debug("- format: %d\n", i->vk.format);
+ sparse_debug("- extent: [%d, %d, %d]\n",
+ i->vk.extent.width, i->vk.extent.height, i->vk.extent.depth);
+ sparse_debug("- mip_levels: %d array_layers: %d samples: %d\n",
+ i->vk.mip_levels, i->vk.array_layers, i->vk.samples);
+ sparse_debug("- n_planes: %d\n", i->n_planes);
+ sparse_debug("- disjoint: %d\n", i->disjoint);
+}
+
+static void
+dump_isl_surf(struct isl_surf *s)
+{
+ if (!INTEL_DEBUG(DEBUG_SPARSE))
+ return;
+
+ sparse_debug("isl_surf:\n");
+
+ const char *dim_s = s->dim == ISL_SURF_DIM_1D ? "1D" :
+ s->dim == ISL_SURF_DIM_2D ? "2D" :
+ s->dim == ISL_SURF_DIM_3D ? "3D" :
+ "(ERROR)";
+ sparse_debug("- dim: %s\n", dim_s);
+ sparse_debug("- tiling: %d (%s)\n", s->tiling,
+ isl_tiling_to_name(s->tiling));
+ sparse_debug("- format: %s\n", isl_format_get_short_name(s->format));
+ sparse_debug("- image_alignment_el: [%d, %d, %d]\n",
+ s->image_alignment_el.w, s->image_alignment_el.h,
+ s->image_alignment_el.d);
+ sparse_debug("- logical_level0_px: [%d, %d, %d, %d]\n",
+ s->logical_level0_px.w,
+ s->logical_level0_px.h,
+ s->logical_level0_px.d,
+ s->logical_level0_px.a);
+ sparse_debug("- phys_level0_sa: [%d, %d, %d, %d]\n",
+ s->phys_level0_sa.w,
+ s->phys_level0_sa.h,
+ s->phys_level0_sa.d,
+ s->phys_level0_sa.a);
+ sparse_debug("- levels: %d samples: %d\n", s->levels, s->samples);
+ sparse_debug("- size_B: %"PRIu64" alignment_B: %u\n",
+ s->size_B, s->alignment_B);
+ sparse_debug("- row_pitch_B: %u\n", s->row_pitch_B);
+ sparse_debug("- array_pitch_el_rows: %u\n", s->array_pitch_el_rows);
+
+ const struct isl_format_layout *layout = isl_format_get_layout(s->format);
+ sparse_debug("- format layout:\n");
+ sparse_debug(" - format:%d bpb:%d bw:%d bh:%d bd:%d\n",
+ layout->format, layout->bpb, layout->bw, layout->bh,
+ layout->bd);
+
+ struct isl_tile_info tile_info;
+ isl_surf_get_tile_info(s, &tile_info);
+
+ sparse_debug("- tile info:\n");
+ sparse_debug(" - format_bpb: %d\n", tile_info.format_bpb);
+ sparse_debug(" - logical_extent_el: [%d, %d, %d, %d]\n",
+ tile_info.logical_extent_el.w,
+ tile_info.logical_extent_el.h,
+ tile_info.logical_extent_el.d,
+ tile_info.logical_extent_el.a);
+ sparse_debug(" - phys_extent_B: [%d, %d]\n",
+ tile_info.phys_extent_B.w,
+ tile_info.phys_extent_B.h);
+}
+
+static VkOffset3D
+vk_offset3d_px_to_el(const VkOffset3D offset_px,
+ const struct isl_format_layout *layout)
+{
+ return (VkOffset3D) {
+ .x = offset_px.x / layout->bw,
+ .y = offset_px.y / layout->bh,
+ .z = offset_px.z / layout->bd,
+ };
+}
+
+static VkOffset3D
+vk_offset3d_el_to_px(const VkOffset3D offset_el,
+ const struct isl_format_layout *layout)
+{
+ return (VkOffset3D) {
+ .x = offset_el.x * layout->bw,
+ .y = offset_el.y * layout->bh,
+ .z = offset_el.z * layout->bd,
+ };
+}
+
+static VkExtent3D
+vk_extent3d_px_to_el(const VkExtent3D extent_px,
+ const struct isl_format_layout *layout)
+{
+ return (VkExtent3D) {
+ .width = extent_px.width / layout->bw,
+ .height = extent_px.height / layout->bh,
+ .depth = extent_px.depth / layout->bd,
+ };
+}
+
+static VkExtent3D
+vk_extent3d_el_to_px(const VkExtent3D extent_el,
+ const struct isl_format_layout *layout)
+{
+ return (VkExtent3D) {
+ .width = extent_el.width * layout->bw,
+ .height = extent_el.height * layout->bh,
+ .depth = extent_el.depth * layout->bd,
+ };
+}
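+
+/* Example of the px<->el conversions above (illustrative numbers): for a
+ * BC1-compressed surface the block layout is 4x4x1 texels, so a 64x64x1 px
+ * extent maps to a 16x16x1 el extent and back.
+ */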
+
+static bool
+isl_tiling_supports_standard_block_shapes(enum isl_tiling tiling)
+{
+ return isl_tiling_is_64(tiling) ||
+ tiling == ISL_TILING_ICL_Ys ||
+ tiling == ISL_TILING_SKL_Ys;
+}
+
+static VkExtent3D
+anv_sparse_get_standard_image_block_shape(enum isl_format format,
+ VkImageType image_type,
+ uint16_t texel_size)
+{
+ const struct isl_format_layout *layout = isl_format_get_layout(format);
+ VkExtent3D block_shape = { .width = 0, .height = 0, .depth = 0 };
+
+ switch (image_type) {
+ case VK_IMAGE_TYPE_1D:
+ /* 1D images don't have a standard block format. */
+ assert(false);
+ break;
+ case VK_IMAGE_TYPE_2D:
+ switch (texel_size) {
+ case 8:
+ block_shape = (VkExtent3D) { .width = 256, .height = 256, .depth = 1 };
+ break;
+ case 16:
+ block_shape = (VkExtent3D) { .width = 256, .height = 128, .depth = 1 };
+ break;
+ case 32:
+ block_shape = (VkExtent3D) { .width = 128, .height = 128, .depth = 1 };
+ break;
+ case 64:
+ block_shape = (VkExtent3D) { .width = 128, .height = 64, .depth = 1 };
+ break;
+ case 128:
+ block_shape = (VkExtent3D) { .width = 64, .height = 64, .depth = 1 };
+ break;
+ default:
+ fprintf(stderr, "unexpected texel_size %d\n", texel_size);
+ assert(false);
+ }
+ break;
+ case VK_IMAGE_TYPE_3D:
+ switch (texel_size) {
+ case 8:
+ block_shape = (VkExtent3D) { .width = 64, .height = 32, .depth = 32 };
+ break;
+ case 16:
+ block_shape = (VkExtent3D) { .width = 32, .height = 32, .depth = 32 };
+ break;
+ case 32:
+ block_shape = (VkExtent3D) { .width = 32, .height = 32, .depth = 16 };
+ break;
+ case 64:
+ block_shape = (VkExtent3D) { .width = 32, .height = 16, .depth = 16 };
+ break;
+ case 128:
+ block_shape = (VkExtent3D) { .width = 16, .height = 16, .depth = 16 };
+ break;
+ default:
+ fprintf(stderr, "unexpected texel_size %d\n", texel_size);
+ assert(false);
+ }
+ break;
+ default:
+ fprintf(stderr, "unexpected image_type %d\n", image_type);
+ assert(false);
+ }
+
+ return vk_extent3d_el_to_px(block_shape, layout);
+}
+
+/* Adds "bind_op" to the list in "submit", first checking whether we can
+ * simply extend the previous operation instead.
+ */
+static VkResult
+anv_sparse_submission_add(struct anv_device *device,
+ struct anv_sparse_submission *submit,
+ struct anv_vm_bind *bind_op)
+{
+ struct anv_vm_bind *prev_bind = submit->binds_len == 0 ? NULL :
+ &submit->binds[submit->binds_len - 1];
+
+ if (prev_bind &&
+ bind_op->op == prev_bind->op &&
+ bind_op->bo == prev_bind->bo &&
+ bind_op->address == prev_bind->address + prev_bind->size &&
+ (bind_op->bo_offset == prev_bind->bo_offset + prev_bind->size ||
+ prev_bind->bo == NULL)) {
+ prev_bind->size += bind_op->size;
+ return VK_SUCCESS;
+ }
+
+ if (submit->binds_len < submit->binds_capacity) {
+ submit->binds[submit->binds_len++] = *bind_op;
+ return VK_SUCCESS;
+ }
+
+ int new_capacity = MAX2(32, submit->binds_capacity * 2);
+ struct anv_vm_bind *new_binds =
+ vk_realloc(&device->vk.alloc, submit->binds,
+ new_capacity * sizeof(*new_binds), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!new_binds)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ new_binds[submit->binds_len] = *bind_op;
+
+ submit->binds = new_binds;
+ submit->binds_len++;
+ submit->binds_capacity = new_capacity;
+
+ return VK_SUCCESS;
+}
+
+/* We really want to try to have all the page tables on as few BOs as possible
+ * to benefit from cache locality and to keep the i915.ko relocation lists
+ * small. On the other hand, we don't want to waste memory on unused space.
+ */
+#define ANV_TRTT_PAGE_TABLE_BO_SIZE (2 * 1024 * 1024)
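+
+/* With 2 MiB per BO and the 4 KiB tables handed out by
+ * trtt_get_page_table_bo(), each page table BO holds 512 tables.
+ */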
+
+static VkResult
+trtt_make_page_table_bo(struct anv_device *device, struct anv_bo **bo)
+{
+ VkResult result;
+ struct anv_trtt *trtt = &device->trtt;
+
+ result = anv_device_alloc_bo(device, "trtt-page-table",
+ ANV_TRTT_PAGE_TABLE_BO_SIZE,
+ ANV_BO_ALLOC_INTERNAL,
+ 0 /* explicit_address */, bo);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (trtt->num_page_table_bos < trtt->page_table_bos_capacity) {
+ trtt->page_table_bos[trtt->num_page_table_bos++] = *bo;
+ } else {
+ int new_capacity = MAX2(8, trtt->page_table_bos_capacity * 2);
+ struct anv_bo **new_page_table_bos =
+ vk_realloc(&device->vk.alloc, trtt->page_table_bos,
+ new_capacity * sizeof(*trtt->page_table_bos), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!new_page_table_bos) {
+ anv_device_release_bo(device, *bo);
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+
+ new_page_table_bos[trtt->num_page_table_bos] = *bo;
+
+ trtt->page_table_bos = new_page_table_bos;
+ trtt->page_table_bos_capacity = new_capacity;
+ trtt->num_page_table_bos++;
+ }
+
+ trtt->cur_page_table_bo = *bo;
+ trtt->next_page_table_bo_offset = 0;
+
+ sparse_debug("new number of page table BOs: %d\n",
+ trtt->num_page_table_bos);
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+trtt_get_page_table_bo(struct anv_device *device, struct anv_bo **bo,
+ uint64_t *bo_addr)
+{
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result;
+
+ if (!trtt->cur_page_table_bo) {
+ result = trtt_make_page_table_bo(device, bo);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ *bo = trtt->cur_page_table_bo;
+ *bo_addr = trtt->cur_page_table_bo->offset +
+ trtt->next_page_table_bo_offset;
+
+ trtt->next_page_table_bo_offset += 4096;
+ if (trtt->next_page_table_bo_offset >= ANV_TRTT_PAGE_TABLE_BO_SIZE)
+ trtt->cur_page_table_bo = NULL;
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_trtt_init_context_state(struct anv_queue *queue)
+{
+ struct anv_device *device = queue->device;
+ struct anv_trtt *trtt = &device->trtt;
+
+ struct drm_syncobj_create create = {
+ .handle = 0,
+ .flags = 0,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &create))
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ assert(create.handle != 0);
+ trtt->timeline_handle = create.handle;
+
+ struct anv_bo *l3_bo;
+ VkResult result = trtt_get_page_table_bo(device, &l3_bo, &trtt->l3_addr);
+ if (result != VK_SUCCESS)
+ return result;
+
+ trtt->l3_mirror = vk_zalloc(&device->vk.alloc, 4096, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!trtt->l3_mirror) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return result;
+ }
+
+ /* L3 has 512 entries, so we can have up to 512 L2 tables. */
+ trtt->l2_mirror = vk_zalloc(&device->vk.alloc, 512 * 4096, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!trtt->l2_mirror) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto fail_free_l3;
+ }
+
+ result = anv_genX(device->info, init_trtt_context_state)(queue);
+
+ return result;
+
+fail_free_l3:
+ vk_free(&device->vk.alloc, trtt->l3_mirror);
+ return result;
+}
+
+static void
+anv_trtt_bind_list_add_entry(struct anv_trtt_bind *binds, int *binds_len,
+ uint64_t pte_addr, uint64_t entry_addr)
+{
+ binds[*binds_len] = (struct anv_trtt_bind) {
+ .pte_addr = pte_addr,
+ .entry_addr = entry_addr,
+ };
+ (*binds_len)++;
+}
+
+/* For L3 and L2 pages, null and invalid entries are indicated by bits 1 and 0
+ * respectively. For L1 entries, the hardware compares the addresses against
+ * what we program to the GFX_TRTT_NULL and GFX_TRTT_INVAL registers.
+ */
+#define ANV_TRTT_L3L2_NULL_ENTRY (1 << 1)
+#define ANV_TRTT_L3L2_INVALID_ENTRY (1 << 0)
+
+/* Adds elements to the anv_trtt_bind structs passed. This doesn't write the
+ * entries to the HW yet.
+ */
+static VkResult
+anv_trtt_bind_add(struct anv_device *device,
+ uint64_t trtt_addr, uint64_t dest_addr,
+ struct anv_trtt_submission *s)
+{
+ VkResult result = VK_SUCCESS;
+ struct anv_trtt *trtt = &device->trtt;
+ bool is_null_bind = dest_addr == ANV_TRTT_L1_NULL_TILE_VAL;
+
+ int l3_index = (trtt_addr >> 35) & 0x1FF;
+ int l2_index = (trtt_addr >> 26) & 0x1FF;
+ int l1_index = (trtt_addr >> 16) & 0x3FF;
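+
+ /* Illustrative decomposition (hypothetical address): for
+ * trtt_addr == (3ull << 35) | (5ull << 26) | (7ull << 16), the indices
+ * above are l3_index == 3, l2_index == 5 and l1_index == 7.
+ */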
+
+ uint64_t l2_addr = trtt->l3_mirror[l3_index];
+ if (l2_addr == ANV_TRTT_L3L2_NULL_ENTRY && is_null_bind) {
+ return VK_SUCCESS;
+ } else if (l2_addr == 0 || l2_addr == ANV_TRTT_L3L2_NULL_ENTRY) {
+ if (is_null_bind) {
+ trtt->l3_mirror[l3_index] = ANV_TRTT_L3L2_NULL_ENTRY;
+
+ anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
+ trtt->l3_addr + l3_index * sizeof(uint64_t),
+ ANV_TRTT_L3L2_NULL_ENTRY);
+
+ return VK_SUCCESS;
+ }
+
+ struct anv_bo *l2_bo;
+ result = trtt_get_page_table_bo(device, &l2_bo, &l2_addr);
+ if (result != VK_SUCCESS)
+ return result;
+
+ trtt->l3_mirror[l3_index] = l2_addr;
+
+ anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
+ trtt->l3_addr + l3_index * sizeof(uint64_t), l2_addr);
+ }
+ assert(l2_addr != 0 && l2_addr != ANV_TRTT_L3L2_NULL_ENTRY);
+
+ /* The first page in the l2_mirror corresponds to l3_index=0 and so on. */
+ uint64_t l1_addr = trtt->l2_mirror[l3_index * 512 + l2_index];
+ if (l1_addr == ANV_TRTT_L3L2_NULL_ENTRY && is_null_bind) {
+ return VK_SUCCESS;
+ } else if (l1_addr == 0 || l1_addr == ANV_TRTT_L3L2_NULL_ENTRY) {
+ if (is_null_bind) {
+ trtt->l2_mirror[l3_index * 512 + l2_index] =
+ ANV_TRTT_L3L2_NULL_ENTRY;
+
+ anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
+ l2_addr + l2_index * sizeof(uint64_t),
+ ANV_TRTT_L3L2_NULL_ENTRY);
+
+ return VK_SUCCESS;
+ }
+
+ struct anv_bo *l1_bo;
+ result = trtt_get_page_table_bo(device, &l1_bo, &l1_addr);
+ if (result != VK_SUCCESS)
+ return result;
+
+ trtt->l2_mirror[l3_index * 512 + l2_index] = l1_addr;
+
+ anv_trtt_bind_list_add_entry(s->l3l2_binds, &s->l3l2_binds_len,
+ l2_addr + l2_index * sizeof(uint64_t), l1_addr);
+ }
+ assert(l1_addr != 0 && l1_addr != ANV_TRTT_L3L2_NULL_ENTRY);
+
+ anv_trtt_bind_list_add_entry(s->l1_binds, &s->l1_binds_len,
+ l1_addr + l1_index * sizeof(uint32_t), dest_addr);
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_sparse_bind_trtt(struct anv_device *device,
+ struct anv_sparse_submission *sparse_submit)
+{
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result;
+
+ /* TR-TT submission needs a queue even when the API entry point doesn't
+ * give one, such as resource creation. */
+ if (!sparse_submit->queue)
+ sparse_submit->queue = trtt->queue;
+
+ /* These capacities are conservative estimations. For L1 binds the
+ * number will match exactly unless we skip NULL binds due to L2 already
+ * being NULL. For L3/L2 things are harder to estimate, but the resulting
+ * numbers are so small that a little overestimation won't hurt.
+ *
+ * We have assertions below to catch estimation errors.
+ */
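+ /* Worked example with hypothetical numbers: a single 128 MiB bind spans
+ * 2048 64 KiB pages, so l1_binds_capacity ends up as 2048 and
+ * l3l2_binds_capacity as 1 + (2048 / 1024 + 1) * 2 = 7.
+ */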
+ int l3l2_binds_capacity = 1;
+ int l1_binds_capacity = 0;
+ for (int b = 0; b < sparse_submit->binds_len; b++) {
+ assert(sparse_submit->binds[b].size % (64 * 1024) == 0);
+ int pages = sparse_submit->binds[b].size / (64 * 1024);
+ l1_binds_capacity += pages;
+ l3l2_binds_capacity += (pages / 1024 + 1) * 2;
+ }
+
+ STACK_ARRAY(struct anv_trtt_bind, l3l2_binds, l3l2_binds_capacity);
+ STACK_ARRAY(struct anv_trtt_bind, l1_binds, l1_binds_capacity);
+ struct anv_trtt_submission trtt_submit = {
+ .sparse = sparse_submit,
+ .l3l2_binds = l3l2_binds,
+ .l1_binds = l1_binds,
+ .l3l2_binds_len = 0,
+ .l1_binds_len = 0,
+ };
+
+ pthread_mutex_lock(&trtt->mutex);
+
+ if (!trtt->l3_addr)
+ anv_trtt_init_context_state(sparse_submit->queue);
+
+ assert(trtt->l3_addr);
+
+ for (int b = 0; b < sparse_submit->binds_len; b++) {
+ struct anv_vm_bind *vm_bind = &sparse_submit->binds[b];
+ for (size_t i = 0; i < vm_bind->size; i += 64 * 1024) {
+ uint64_t trtt_addr = vm_bind->address + i;
+ uint64_t dest_addr =
+ (vm_bind->op == ANV_VM_BIND && vm_bind->bo) ?
+ vm_bind->bo->offset + vm_bind->bo_offset + i :
+ ANV_TRTT_L1_NULL_TILE_VAL;
+
+ result = anv_trtt_bind_add(device, trtt_addr, dest_addr,
+ &trtt_submit);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+ }
+
+ assert(trtt_submit.l3l2_binds_len <= l3l2_binds_capacity);
+ assert(trtt_submit.l1_binds_len <= l1_binds_capacity);
+
+ sparse_debug("trtt_binds: num_vm_binds:%02d l3l2:%04d l1:%04d\n",
+ sparse_submit->binds_len, trtt_submit.l3l2_binds_len,
+ trtt_submit.l1_binds_len);
+
+ if (trtt_submit.l3l2_binds_len || trtt_submit.l1_binds_len)
+ result = anv_genX(device->info, write_trtt_entries)(&trtt_submit);
+
+ if (result == VK_SUCCESS)
+ ANV_RMV(vm_binds, device, sparse_submit->binds, sparse_submit->binds_len);
+
+out:
+ pthread_mutex_unlock(&trtt->mutex);
+ STACK_ARRAY_FINISH(l1_binds);
+ STACK_ARRAY_FINISH(l3l2_binds);
+ return result;
+}
+
+static VkResult
+anv_sparse_bind_vm_bind(struct anv_device *device,
+ struct anv_sparse_submission *submit)
+{
+ struct anv_queue *queue = submit->queue;
+
+ if (!queue)
+ assert(submit->wait_count == 0 && submit->signal_count == 0);
+
+ return device->kmd_backend->vm_bind(device, submit, ANV_VM_BIND_FLAG_NONE);
+}
+
+VkResult
+anv_sparse_bind(struct anv_device *device,
+ struct anv_sparse_submission *submit)
+{
+ if (INTEL_DEBUG(DEBUG_SPARSE)) {
+ for (int b = 0; b < submit->binds_len; b++)
+ dump_anv_vm_bind(device, &submit->binds[b]);
+ }
+
+ return device->physical->sparse_type == ANV_SPARSE_TYPE_TRTT ?
+ anv_sparse_bind_trtt(device, submit) :
+ anv_sparse_bind_vm_bind(device, submit);
+}
+
+VkResult
+anv_init_sparse_bindings(struct anv_device *device,
+ uint64_t size_,
+ struct anv_sparse_binding_data *sparse,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t client_address,
+ struct anv_address *out_address)
+{
+ uint64_t size = align64(size_, ANV_SPARSE_BLOCK_SIZE);
+
+ if (device->physical->sparse_type == ANV_SPARSE_TYPE_TRTT)
+ alloc_flags |= ANV_BO_ALLOC_TRTT;
+
+ sparse->address = anv_vma_alloc(device, size, ANV_SPARSE_BLOCK_SIZE,
+ alloc_flags,
+ intel_48b_address(client_address),
+ &sparse->vma_heap);
+ sparse->size = size;
+
+ out_address->bo = NULL;
+ out_address->offset = sparse->address;
+
+ struct anv_vm_bind bind = {
+ .bo = NULL, /* That's a NULL binding. */
+ .address = sparse->address,
+ .bo_offset = 0,
+ .size = size,
+ .op = ANV_VM_BIND,
+ };
+ struct anv_sparse_submission submit = {
+ .queue = NULL,
+ .binds = &bind,
+ .binds_len = 1,
+ .binds_capacity = 1,
+ .wait_count = 0,
+ .signal_count = 0,
+ };
+ VkResult res = anv_sparse_bind(device, &submit);
+ if (res != VK_SUCCESS) {
+ anv_vma_free(device, sparse->vma_heap, sparse->address, sparse->size);
+ return res;
+ }
+
+ p_atomic_inc(&device->num_sparse_resources);
+ return VK_SUCCESS;
+}
+
+void
+anv_free_sparse_bindings(struct anv_device *device,
+ struct anv_sparse_binding_data *sparse)
+{
+ if (!sparse->address)
+ return;
+
+ sparse_debug("%s: address:0x%016"PRIx64" size:0x%08"PRIx64"\n",
+ __func__, sparse->address, sparse->size);
+
+ p_atomic_dec(&device->num_sparse_resources);
+
+ struct anv_vm_bind unbind = {
+ .bo = NULL,
+ .address = sparse->address,
+ .bo_offset = 0,
+ .size = sparse->size,
+ .op = ANV_VM_UNBIND,
+ };
+ struct anv_sparse_submission submit = {
+ .queue = NULL,
+ .binds = &unbind,
+ .binds_len = 1,
+ .binds_capacity = 1,
+ .wait_count = 0,
+ .signal_count = 0,
+ };
+ VkResult res = anv_sparse_bind(device, &submit);
+
+ /* Our callers don't have a way to signal failure to the upper layers, so
+ * just keep the vma if we fail to unbind it. Still, let's have an
+ * assertion because this really shouldn't be happening.
+ */
+ assert(res == VK_SUCCESS);
+ if (res != VK_SUCCESS)
+ return;
+
+ anv_vma_free(device, sparse->vma_heap, sparse->address, sparse->size);
+}
+
+static VkExtent3D
+anv_sparse_calc_block_shape(struct anv_physical_device *pdevice,
+ struct isl_surf *surf)
+{
+ const struct isl_format_layout *layout =
+ isl_format_get_layout(surf->format);
+ const int Bpb = layout->bpb / 8;
+
+ struct isl_tile_info tile_info;
+ isl_surf_get_tile_info(surf, &tile_info);
+
+ VkExtent3D block_shape_el = {
+ .width = tile_info.logical_extent_el.width,
+ .height = tile_info.logical_extent_el.height,
+ .depth = tile_info.logical_extent_el.depth,
+ };
+ VkExtent3D block_shape_px = vk_extent3d_el_to_px(block_shape_el, layout);
+
+ if (surf->tiling == ISL_TILING_LINEAR) {
+ uint32_t elements_per_row = surf->row_pitch_B /
+ (block_shape_el.width * Bpb);
+ uint32_t rows_per_tile = ANV_SPARSE_BLOCK_SIZE /
+ (elements_per_row * Bpb);
+ assert(rows_per_tile * elements_per_row * Bpb == ANV_SPARSE_BLOCK_SIZE);
+
+ block_shape_px = (VkExtent3D) {
+ .width = elements_per_row * layout->bw,
+ .height = rows_per_tile * layout->bh,
+ .depth = layout->bd,
+ };
+ }
+
+ return block_shape_px;
+}
+
+VkSparseImageFormatProperties
+anv_sparse_calc_image_format_properties(struct anv_physical_device *pdevice,
+ VkImageAspectFlags aspect,
+ VkImageType vk_image_type,
+ struct isl_surf *surf)
+{
+ const struct isl_format_layout *isl_layout =
+ isl_format_get_layout(surf->format);
+ const int bpb = isl_layout->bpb;
+ assert(bpb == 8 || bpb == 16 || bpb == 32 || bpb == 64 || bpb == 128);
+ const int Bpb = bpb / 8;
+
+ VkExtent3D granularity = anv_sparse_calc_block_shape(pdevice, surf);
+ bool is_standard = false;
+ bool is_known_nonstandard_format = false;
+
+ if (vk_image_type != VK_IMAGE_TYPE_1D) {
+ VkExtent3D std_shape =
+ anv_sparse_get_standard_image_block_shape(surf->format, vk_image_type,
+ bpb);
+ /* YUV formats don't work with Tile64, which is required if we want to
+ * claim standard block shapes. The spec requires us to support all
+ * non-compressed color formats that non-sparse supports, so we can't
+ * just say YUV formats are not supported by Sparse. So we end
+ * supporting this format and anv_sparse_calc_miptail_properties() will
+ * say that everything is part of the miptail.
+ *
+ * For more details on the hardware restriction, please check
+ * isl_gfx125_filter_tiling().
+ */
+ if (pdevice->info.verx10 >= 125 && isl_format_is_yuv(surf->format))
+ is_known_nonstandard_format = true;
+
+ /* The standard block shapes (and by extension, the tiling formats they
+ * require) are simply incompatible with getting a 2D view of a 3D
+ * image.
+ */
+ if (surf->usage & ISL_SURF_USAGE_2D_3D_COMPATIBLE_BIT)
+ is_known_nonstandard_format = true;
+
+ is_standard = granularity.width == std_shape.width &&
+ granularity.height == std_shape.height &&
+ granularity.depth == std_shape.depth;
+
+ /* TODO: dEQP seems to care about the block shapes being standard even
+ * for the cases where is_known_nonstandard_format is true. Luckily as
+ * of today all of those cases are NotSupported but sooner or later we
+ * may end up getting a failure.
+ * Notice that in practice we report these cases as having the mip tail
+ * starting on mip level 0, so the reported block shapes are irrelevant
+ * since non-opaque binds are not supported. Still, dEQP seems to care.
+ */
+ assert(is_standard || is_known_nonstandard_format);
+ }
+
+ uint32_t block_size = granularity.width * granularity.height *
+ granularity.depth * Bpb;
+ bool wrong_block_size = block_size != ANV_SPARSE_BLOCK_SIZE;
+
+ return (VkSparseImageFormatProperties) {
+ .aspectMask = aspect,
+ .imageGranularity = granularity,
+ .flags = ((is_standard || is_known_nonstandard_format) ? 0 :
+ VK_SPARSE_IMAGE_FORMAT_NONSTANDARD_BLOCK_SIZE_BIT) |
+ (wrong_block_size ? VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT :
+ 0),
+ };
+}
+
+/* The miptail is supposed to be this region where the tiniest mip levels
+ * are squished together in one single page, which should save us some memory.
+ * It's a hardware feature that is only available with certain tiling
+ * formats - the ones we always want to use for sparse resources.
+ *
+ * For sparse, the main feature of the miptail is that it only supports opaque
+ * binds, so you either bind the whole miptail or you bind nothing at all,
+ * there are no subresources inside it to separately bind. While the idea is
+ * that the miptail as reported by sparse should match what our hardware does,
+ * in practice we can say in our sparse functions that certain mip levels are
+ * part of the miptail while from the point of view of our hardwared they
+ * aren't.
+ *
+ * If we detect we're using the sparse-friendly tiling formats and ISL
+ * supports miptails for them, we can just trust the miptail level set by ISL
+ * and things can proceed as The Spec intended.
+ *
+ * However, if that's not the case, we have to go on a best-effort policy. We
+ * could simply declare that every mip level is part of the miptail and be
+ * done, but since that kinda defeats the purpose of Sparse we try to find
+ * what level we really should be reporting as the first miptail level based
+ * on the alignments of the surface subresources.
+ */
+void
+anv_sparse_calc_miptail_properties(struct anv_device *device,
+ struct anv_image *image,
+ VkImageAspectFlags vk_aspect,
+ uint32_t *imageMipTailFirstLod,
+ VkDeviceSize *imageMipTailSize,
+ VkDeviceSize *imageMipTailOffset,
+ VkDeviceSize *imageMipTailStride)
+{
+ const uint32_t plane = anv_image_aspect_to_plane(image, vk_aspect);
+ struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+ uint64_t binding_plane_offset =
+ image->planes[plane].primary_surface.memory_range.offset;
+ const struct isl_format_layout *isl_layout =
+ isl_format_get_layout(surf->format);
+ const int Bpb = isl_layout->bpb / 8;
+ struct isl_tile_info tile_info;
+ isl_surf_get_tile_info(surf, &tile_info);
+ uint32_t tile_size = tile_info.logical_extent_el.width * Bpb *
+ tile_info.logical_extent_el.height *
+ tile_info.logical_extent_el.depth;
+
+ uint64_t layer1_offset;
+ uint32_t x_off, y_off;
+
+ /* Treat the whole thing as a single miptail. We should have already
+ * reported this image as VK_SPARSE_IMAGE_FORMAT_SINGLE_MIPTAIL_BIT.
+ *
+ * In theory we could try to make ISL massage the alignments so that we
+ * could at least claim mip level 0 to be not part of the miptail, but
+ * that could end up wasting a lot of memory, so it's better to do
+ * nothing and focus our efforts on making things use the appropriate
+ * tiling formats that give us the standard block shapes.
+ */
+ if (tile_size != ANV_SPARSE_BLOCK_SIZE)
+ goto out_everything_is_miptail;
+
+ assert(surf->tiling != ISL_TILING_LINEAR);
+
+ if (image->vk.array_layers == 1) {
+ layer1_offset = surf->size_B;
+ } else {
+ isl_surf_get_image_offset_B_tile_sa(surf, 0, 1, 0, &layer1_offset,
+ &x_off, &y_off);
+ if (x_off || y_off)
+ goto out_everything_is_miptail;
+ }
+ assert(layer1_offset % tile_size == 0);
+
+ /* We could try to do better here, but there's not really any point since
+ * we should be supporting the appropriate tiling formats everywhere.
+ */
+ if (!isl_tiling_supports_standard_block_shapes(surf->tiling))
+ goto out_everything_is_miptail;
+
+ int miptail_first_level = surf->miptail_start_level;
+ if (miptail_first_level >= image->vk.mip_levels)
+ goto out_no_miptail;
+
+ uint64_t miptail_offset = 0;
+ isl_surf_get_image_offset_B_tile_sa(surf, miptail_first_level, 0, 0,
+ &miptail_offset,
+ &x_off, &y_off);
+ assert(x_off == 0 && y_off == 0);
+ assert(miptail_offset % tile_size == 0);
+
+ *imageMipTailFirstLod = miptail_first_level;
+ *imageMipTailSize = tile_size;
+ *imageMipTailOffset = binding_plane_offset + miptail_offset;
+ *imageMipTailStride = layer1_offset;
+ goto out_debug;
+
+out_no_miptail:
+ *imageMipTailFirstLod = image->vk.mip_levels;
+ *imageMipTailSize = 0;
+ *imageMipTailOffset = 0;
+ *imageMipTailStride = 0;
+ goto out_debug;
+
+out_everything_is_miptail:
+ *imageMipTailFirstLod = 0;
+ *imageMipTailSize = surf->size_B;
+ *imageMipTailOffset = binding_plane_offset;
+ *imageMipTailStride = 0;
+
+out_debug:
+ sparse_debug("miptail first_lod:%d size:%"PRIu64" offset:%"PRIu64" "
+ "stride:%"PRIu64"\n",
+ *imageMipTailFirstLod, *imageMipTailSize,
+ *imageMipTailOffset, *imageMipTailStride);
+}
+
+static struct anv_vm_bind
+vk_bind_to_anv_vm_bind(struct anv_sparse_binding_data *sparse,
+ const struct VkSparseMemoryBind *vk_bind)
+{
+ struct anv_vm_bind anv_bind = {
+ .bo = NULL,
+ .address = sparse->address + vk_bind->resourceOffset,
+ .bo_offset = 0,
+ .size = vk_bind->size,
+ .op = ANV_VM_BIND,
+ };
+
+ assert(vk_bind->size);
+ assert(vk_bind->resourceOffset + vk_bind->size <= sparse->size);
+
+ if (vk_bind->memory != VK_NULL_HANDLE) {
+ anv_bind.bo = anv_device_memory_from_handle(vk_bind->memory)->bo;
+ anv_bind.bo_offset = vk_bind->memoryOffset;
+ assert(vk_bind->memoryOffset + vk_bind->size <= anv_bind.bo->size);
+ }
+
+ return anv_bind;
+}
+
+static VkResult
+anv_sparse_bind_resource_memory(struct anv_device *device,
+ struct anv_sparse_binding_data *sparse,
+ uint64_t resource_size,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit)
+{
+ struct anv_vm_bind bind = vk_bind_to_anv_vm_bind(sparse, vk_bind);
+ uint64_t rem = vk_bind->size % ANV_SPARSE_BLOCK_SIZE;
+
+ if (rem != 0) {
+ if (vk_bind->resourceOffset + vk_bind->size == resource_size)
+ bind.size += ANV_SPARSE_BLOCK_SIZE - rem;
+ else
+ return vk_error(device, VK_ERROR_VALIDATION_FAILED_EXT);
+ }
+
+ return anv_sparse_submission_add(device, submit, &bind);
+}
+
+VkResult
+anv_sparse_bind_buffer(struct anv_device *device,
+ struct anv_buffer *buffer,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit)
+{
+ return anv_sparse_bind_resource_memory(device, &buffer->sparse_data,
+ buffer->vk.size,
+ vk_bind, submit);
+}
+
+VkResult
+anv_sparse_bind_image_opaque(struct anv_device *device,
+ struct anv_image *image,
+ const VkSparseMemoryBind *vk_bind,
+ struct anv_sparse_submission *submit)
+{
+ struct anv_image_binding *b =
+ &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN];
+ assert(!image->disjoint);
+
+ return anv_sparse_bind_resource_memory(device, &b->sparse_data,
+ b->memory_range.size,
+ vk_bind, submit);
+}
+
+VkResult
+anv_sparse_bind_image_memory(struct anv_queue *queue,
+ struct anv_image *image,
+ const VkSparseImageMemoryBind *bind,
+ struct anv_sparse_submission *submit)
+{
+ struct anv_device *device = queue->device;
+ VkImageAspectFlags aspect = bind->subresource.aspectMask;
+ uint32_t mip_level = bind->subresource.mipLevel;
+ uint32_t array_layer = bind->subresource.arrayLayer;
+
+ assert(!(bind->flags & VK_SPARSE_MEMORY_BIND_METADATA_BIT));
+
+ struct anv_image_binding *img_binding = image->disjoint ?
+ anv_image_aspect_to_binding(image, aspect) :
+ &image->bindings[ANV_IMAGE_MEMORY_BINDING_MAIN];
+ struct anv_sparse_binding_data *sparse_data = &img_binding->sparse_data;
+
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+ struct isl_surf *surf = &image->planes[plane].primary_surface.isl;
+ uint64_t binding_plane_offset =
+ image->planes[plane].primary_surface.memory_range.offset;
+ const struct isl_format_layout *layout =
+ isl_format_get_layout(surf->format);
+
+ if (INTEL_DEBUG(DEBUG_SPARSE)) {
+ sparse_debug("%s:", __func__);
+ sparse_debug("mip_level:%d array_layer:%d\n", mip_level, array_layer);
+ sparse_debug("aspect:0x%x plane:%d\n", aspect, plane);
+ sparse_debug("binding offset: [%d, %d, %d] extent: [%d, %d, %d]\n",
+ bind->offset.x, bind->offset.y, bind->offset.z,
+ bind->extent.width, bind->extent.height,
+ bind->extent.depth);
+ dump_anv_image(image);
+ dump_isl_surf(surf);
+ sparse_debug("\n");
+ }
+
+ VkExtent3D block_shape_px =
+ anv_sparse_calc_block_shape(device->physical, surf);
+ VkExtent3D block_shape_el = vk_extent3d_px_to_el(block_shape_px, layout);
+
+ /* Both bind->offset and bind->extent are in pixel units. */
+ VkOffset3D bind_offset_el = vk_offset3d_px_to_el(bind->offset, layout);
+
+ /* The spec says we only really need to align when, for a given coordinate,
+ * offset + extent equals the corresponding dimension of the image
+ * subresource, but all other non-aligned usage is invalid, so just
+ * align everything.
+ */
+ VkExtent3D bind_extent_px = {
+ .width = ALIGN_NPOT(bind->extent.width, block_shape_px.width),
+ .height = ALIGN_NPOT(bind->extent.height, block_shape_px.height),
+ .depth = ALIGN_NPOT(bind->extent.depth, block_shape_px.depth),
+ };
+ VkExtent3D bind_extent_el = vk_extent3d_px_to_el(bind_extent_px, layout);
+
+ /* A sparse block should correspond to our tile size, so this has to be
+ * either 4k or 64k depending on the tiling format. */
+ const uint64_t block_size_B = block_shape_el.width * (layout->bpb / 8) *
+ block_shape_el.height *
+ block_shape_el.depth;
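+ /* For example (assuming the standard 2D block shapes earlier in this file):
+ * a 32 bpp surface uses 128x128x1 blocks, i.e. 128 * 128 * 4 B = 64 KiB per
+ * block.
+ */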
+ /* How many blocks are necessary to form a whole line on this image? */
+ const uint32_t blocks_per_line = surf->row_pitch_B / (layout->bpb / 8) /
+ block_shape_el.width;
+ /* The loop below will try to bind a whole line of blocks at a time as
+ * they're guaranteed to be contiguous, so we calculate how many blocks
+ * that is and how big is each block to figure the bind size of a whole
+ * line.
+ */
+ uint64_t line_bind_size_in_blocks = bind_extent_el.width /
+ block_shape_el.width;
+ uint64_t line_bind_size = line_bind_size_in_blocks * block_size_B;
+ assert(line_bind_size_in_blocks != 0);
+ assert(line_bind_size != 0);
+
+ uint64_t memory_offset = bind->memoryOffset;
+ for (uint32_t z = bind_offset_el.z;
+ z < bind_offset_el.z + bind_extent_el.depth;
+ z += block_shape_el.depth) {
+ uint64_t subresource_offset_B;
+ uint32_t subresource_x_offset, subresource_y_offset;
+ isl_surf_get_image_offset_B_tile_sa(surf, mip_level, array_layer, z,
+ &subresource_offset_B,
+ &subresource_x_offset,
+ &subresource_y_offset);
+ assert(subresource_x_offset == 0 && subresource_y_offset == 0);
+ assert(subresource_offset_B % block_size_B == 0);
+
+ for (uint32_t y = bind_offset_el.y;
+ y < bind_offset_el.y + bind_extent_el.height;
+ y += block_shape_el.height) {
+ uint32_t line_block_offset = y / block_shape_el.height *
+ blocks_per_line;
+ uint64_t line_start_B = subresource_offset_B +
+ line_block_offset * block_size_B;
+ uint64_t bind_offset_B = line_start_B +
+ (bind_offset_el.x / block_shape_el.width) *
+ block_size_B;
+
+ VkSparseMemoryBind opaque_bind = {
+ .resourceOffset = binding_plane_offset + bind_offset_B,
+ .size = line_bind_size,
+ .memory = bind->memory,
+ .memoryOffset = memory_offset,
+ .flags = bind->flags,
+ };
+
+ memory_offset += line_bind_size;
+
+ assert(line_start_B % block_size_B == 0);
+ assert(opaque_bind.resourceOffset % block_size_B == 0);
+ assert(opaque_bind.size % block_size_B == 0);
+
+ struct anv_vm_bind anv_bind = vk_bind_to_anv_vm_bind(sparse_data,
+ &opaque_bind);
+ VkResult result = anv_sparse_submission_add(device, submit,
+ &anv_bind);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_sparse_image_check_support(struct anv_physical_device *pdevice,
+ VkImageCreateFlags flags,
+ VkImageTiling tiling,
+ VkSampleCountFlagBits samples,
+ VkImageType type,
+ VkFormat vk_format)
+{
+ assert(flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT);
+
+ /* The spec says:
+ * "A sparse image created using VK_IMAGE_CREATE_SPARSE_BINDING_BIT (but
+ * not VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT) supports all formats that
+ * non-sparse usage supports, and supports both VK_IMAGE_TILING_OPTIMAL
+ * and VK_IMAGE_TILING_LINEAR tiling."
+ */
+ if (!(flags & VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT))
+ return VK_SUCCESS;
+
+ /* From here on, these are the rules:
+ * "A sparse image created using VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT
+ * supports all non-compressed color formats with power-of-two element
+ * size that non-sparse usage supports. Additional formats may also be
+ * supported and can be queried via
+ * vkGetPhysicalDeviceSparseImageFormatProperties.
+ * VK_IMAGE_TILING_LINEAR tiling is not supported."
+ */
+
+ /* We choose not to support sparse residency on emulated compressed
+ * formats due to the additional image plane. It would make the
+ * implementation extremely complicated.
+ */
+ if (anv_is_format_emulated(pdevice, vk_format))
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ /* While the spec itself says linear is not supported (see above), deqp-vk
+ * tries to create linear sparse images anyway, so we have to check for it.
+ * This is also said in VUID-VkImageCreateInfo-tiling-04121:
+ * "If tiling is VK_IMAGE_TILING_LINEAR, flags must not contain
+ * VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT"
+ */
+ if (tiling == VK_IMAGE_TILING_LINEAR)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ /* TODO: not supported yet. */
+ if (samples != VK_SAMPLE_COUNT_1_BIT)
+ return VK_ERROR_FEATURE_NOT_PRESENT;
+
+ /* While the Vulkan spec allows us to support depth/stencil sparse images
+ * everywhere, sometimes we're not able to have them with the tiling
+ * formats that give us the standard block shapes. Having standard block
+ * shapes is higher priority than supporting depth/stencil sparse images.
+ *
+ * Please see ISL's filter_tiling() functions for accurate explanations of
+ * why depth/stencil images are not always supported with the tiling
+ * formats we want. But in short: depth/stencil support in our HW is
+ * limited to 2D and we can't build a 2D view of a 3D image with these
+ * tiling formats due to the address swizzling being different.
+ */
+ VkImageAspectFlags aspects = vk_format_aspects(vk_format);
+ if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ /* For 125+, isl_gfx125_filter_tiling() claims 3D is not supported.
+ * For the previous platforms, isl_gfx6_filter_tiling() says only 2D is
+ * supported.
+ */
+ if (pdevice->info.verx10 >= 125) {
+ if (type == VK_IMAGE_TYPE_3D)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+ } else {
+ if (type != VK_IMAGE_TYPE_2D)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+ }
+ }
+
+ const struct anv_format *anv_format = anv_get_format(vk_format);
+ if (!anv_format)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ for (int p = 0; p < anv_format->n_planes; p++) {
+ enum isl_format isl_format = anv_format->planes[p].isl_format;
+
+ if (isl_format == ISL_FORMAT_UNSUPPORTED)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ const struct isl_format_layout *isl_layout =
+ isl_format_get_layout(isl_format);
+
+ /* As quoted above, we only need to support the power-of-two formats.
+ * The problem with the non-power-of-two formats is that we need an
+ * integer number of pixels to fit into a sparse block, so we'd need the
+ * sparse block sizes to be, for example, 192k for 24bpp.
+ *
+ * TODO: add support for these formats.
+ */
+ if (isl_layout->bpb != 8 && isl_layout->bpb != 16 &&
+ isl_layout->bpb != 32 && isl_layout->bpb != 64 &&
+ isl_layout->bpb != 128)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+ }
+
+ /* These YUV formats are considered by Vulkan to be compressed 2x1 blocks.
+ * We don't need to support them since they're compressed. On Gfx12 we
+ * can't even have Tile64 for them. Once we do support these formats we'll
+ * have to report the correct block shapes because dEQP cares about them,
+ * and we'll have to adjust for the fact that ISL treats these as 16bpp 1x1
+ * blocks instead of 32bpp 2x1 compressed blocks (as block shapes are
+ * reported in units of compressed blocks).
+ */
+ if (vk_format == VK_FORMAT_G8B8G8R8_422_UNORM ||
+ vk_format == VK_FORMAT_B8G8R8G8_422_UNORM)
+ return VK_ERROR_FORMAT_NOT_SUPPORTED;
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_trtt_garbage_collect_batches(struct anv_device *device)
+{
+ struct anv_trtt *trtt = &device->trtt;
+
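+ /* Throttle: only pay for the syncobj query ioctl once every 8 batches. */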
+ if (trtt->timeline_val % 8 != 7)
+ return VK_SUCCESS;
+
+ uint64_t cur_timeline_val = 0;
+ struct drm_syncobj_timeline_array array = {
+ .handles = (uintptr_t)&trtt->timeline_handle,
+ .points = (uintptr_t)&cur_timeline_val,
+ .count_handles = 1,
+ .flags = 0,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_QUERY, &array))
+ return vk_error(device, VK_ERROR_UNKNOWN);
+
+ list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
+ &trtt->in_flight_batches, link) {
+ if (trtt_bbo->timeline_val > cur_timeline_val)
+ return VK_SUCCESS;
+
+ anv_trtt_batch_bo_free(device, trtt_bbo);
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
+ struct anv_trtt_batch_bo **out_trtt_bbo)
+{
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result;
+
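+ /* Opportunistically free batch BOs whose binds have already completed. */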
+ anv_trtt_garbage_collect_batches(device);
+
+ struct anv_trtt_batch_bo *trtt_bbo =
+ vk_alloc(&device->vk.alloc, sizeof(*trtt_bbo), 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!trtt_bbo)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size,
+ &trtt_bbo->bo);
+ if (result != VK_SUCCESS)
+ goto out;
+
+ trtt_bbo->size = batch_size;
+ trtt_bbo->timeline_val = ++trtt->timeline_val;
+
+ list_addtail(&trtt_bbo->link, &trtt->in_flight_batches);
+
+ *out_trtt_bbo = trtt_bbo;
+
+ return VK_SUCCESS;
+out:
+ vk_free(&device->vk.alloc, trtt_bbo);
+ return result;
+}
diff --git a/src/intel/vulkan/anv_util.c b/src/intel/vulkan/anv_util.c
index b06ee760f70..15a160b6194 100644
--- a/src/intel/vulkan/anv_util.c
+++ b/src/intel/vulkan/anv_util.c
@@ -31,24 +31,6 @@
#include "anv_private.h"
#include "vk_enum_to_str.h"
-/** Log an error message. */
-void anv_printflike(1, 2)
-anv_loge(const char *format, ...)
-{
- va_list va;
-
- va_start(va, format);
- anv_loge_v(format, va);
- va_end(va);
-}
-
-/** \see anv_loge() */
-void
-anv_loge_v(const char *format, va_list va)
-{
- mesa_loge_v(format, va);
-}
-
void
__anv_perf_warn(struct anv_device *device,
const struct vk_object_base *object,
@@ -56,91 +38,119 @@ __anv_perf_warn(struct anv_device *device,
{
va_list ap;
char buffer[256];
- char report[512];
va_start(ap, format);
vsnprintf(buffer, sizeof(buffer), format, ap);
va_end(ap);
- snprintf(report, sizeof(report), "%s: %s", file, buffer);
-
- vk_debug_report(&device->physical->instance->vk,
- VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT,
- object, line, 0, "anv", report);
-
- mesa_logw("%s:%d: PERF: %s", file, line, buffer);
-}
-
-VkResult
-__vk_errorv(struct anv_instance *instance,
- const struct vk_object_base *object, VkResult error,
- const char *file, int line, const char *format, va_list ap)
-{
- char buffer[256];
- char report[512];
-
- const char *error_str = vk_Result_to_str(error);
-
- if (format) {
- vsnprintf(buffer, sizeof(buffer), format, ap);
-
- snprintf(report, sizeof(report), "%s:%d: %s (%s)", file, line, buffer,
- error_str);
+ if (object) {
+ __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT,
+ VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT,
+ VK_LOG_OBJS(object), file, line,
+ "PERF: %s", buffer);
} else {
- snprintf(report, sizeof(report), "%s:%d: %s", file, line, error_str);
+ __vk_log(VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT,
+ VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT,
+ VK_LOG_NO_OBJS(device->physical->instance), file, line,
+ "PERF: %s", buffer);
}
-
- if (instance) {
- vk_debug_report(&instance->vk, VK_DEBUG_REPORT_ERROR_BIT_EXT,
- object, line, 0, "anv", report);
- }
-
- mesa_loge("%s", report);
-
- return error;
-}
-
-VkResult
-__vk_errorf(struct anv_instance *instance,
- const struct vk_object_base *object, VkResult error,
- const char *file, int line, const char *format, ...)
-{
- va_list ap;
-
- va_start(ap, format);
- __vk_errorv(instance, object, error, file, line, format, ap);
- va_end(ap);
-
- return error;
}
void
-anv_dump_pipe_bits(enum anv_pipe_bits bits)
+anv_dump_pipe_bits(enum anv_pipe_bits bits, FILE *f)
{
if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
- fputs("+depth_flush ", stderr);
+ fputs("+depth_flush ", f);
if (bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT)
- fputs("+dc_flush ", stderr);
+ fputs("+dc_flush ", f);
if (bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
- fputs("+hdc_flush ", stderr);
+ fputs("+hdc_flush ", f);
if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
- fputs("+rt_flush ", stderr);
+ fputs("+rt_flush ", f);
if (bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT)
- fputs("+tile_flush ", stderr);
+ fputs("+tile_flush ", f);
if (bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT)
- fputs("+state_inval ", stderr);
+ fputs("+state_inval ", f);
if (bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT)
- fputs("+const_inval ", stderr);
+ fputs("+const_inval ", f);
if (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)
- fputs("+vf_inval ", stderr);
+ fputs("+vf_inval ", f);
if (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT)
- fputs("+tex_inval ", stderr);
+ fputs("+tex_inval ", f);
if (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT)
- fputs("+ic_inval ", stderr);
+ fputs("+ic_inval ", f);
if (bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT)
- fputs("+pb_stall ", stderr);
+ fputs("+pb_stall ", f);
+ if (bits & ANV_PIPE_PSS_STALL_SYNC_BIT)
+ fputs("+pss_stall ", f);
if (bits & ANV_PIPE_DEPTH_STALL_BIT)
- fputs("+depth_stall ", stderr);
- if (bits & ANV_PIPE_CS_STALL_BIT)
- fputs("+cs_stall ", stderr);
+ fputs("+depth_stall ", f);
+ if (bits & ANV_PIPE_CS_STALL_BIT ||
+ bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT)
+ fputs("+cs_stall ", f);
+ if (bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
+ fputs("+utdp_flush ", f);
+ if (bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT)
+ fputs("+ccs_flush ", f);
+}
+
+const char *
+anv_gfx_state_bit_to_str(enum anv_gfx_state_bits state)
+{
+#define NAME(name) case ANV_GFX_STATE_##name: return #name;
+ switch (state) {
+ NAME(URB);
+ NAME(VF_STATISTICS);
+ NAME(VF_SGVS);
+ NAME(VF_SGVS_2);
+ NAME(VF_SGVS_INSTANCING);
+ NAME(PRIMITIVE_REPLICATION);
+ NAME(MULTISAMPLE);
+ NAME(SBE);
+ NAME(SBE_SWIZ);
+ NAME(SO_DECL_LIST);
+ NAME(VS);
+ NAME(HS);
+ NAME(DS);
+ NAME(GS);
+ NAME(PS);
+ NAME(PS_EXTRA);
+ NAME(SBE_MESH);
+ NAME(CLIP_MESH);
+ NAME(MESH_CONTROL);
+ NAME(MESH_SHADER);
+ NAME(MESH_DISTRIB);
+ NAME(TASK_CONTROL);
+ NAME(TASK_SHADER);
+ NAME(TASK_REDISTRIB);
+ NAME(BLEND_STATE_PTR);
+ NAME(CLIP);
+ NAME(CC_STATE);
+ NAME(CC_STATE_PTR);
+ NAME(CPS);
+ NAME(DEPTH_BOUNDS);
+ NAME(INDEX_BUFFER);
+ NAME(LINE_STIPPLE);
+ NAME(PS_BLEND);
+ NAME(RASTER);
+ NAME(SAMPLE_MASK);
+ NAME(SAMPLE_PATTERN);
+ NAME(SCISSOR);
+ NAME(SF);
+ NAME(STREAMOUT);
+ NAME(TE);
+ NAME(VERTEX_INPUT);
+ NAME(VF);
+ NAME(VF_TOPOLOGY);
+ NAME(VFG);
+ NAME(VIEWPORT_CC);
+ NAME(VIEWPORT_CC_PTR);
+ NAME(VIEWPORT_SF_CLIP);
+ NAME(WM);
+ NAME(WM_DEPTH_STENCIL);
+ NAME(PMA_FIX);
+ NAME(WA_18019816803);
+ NAME(TBIMR_TILE_PASS_INFO);
+ default: unreachable("invalid state");
+ }
}
diff --git a/src/intel/vulkan/anv_utrace.c b/src/intel/vulkan/anv_utrace.c
new file mode 100644
index 00000000000..9b66300a44c
--- /dev/null
+++ b/src/intel/vulkan/anv_utrace.c
@@ -0,0 +1,684 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "anv_internal_kernels.h"
+
+#include "ds/intel_tracepoints.h"
+#include "genxml/gen9_pack.h"
+#include "perf/intel_perf.h"
+#include "util/perf/cpu_trace.h"
+
+#include "vk_common_entrypoints.h"
+
+/** Timestamp structure format */
+union anv_utrace_timestamp {
+ /* Timestamp written by either 2 * MI_STORE_REGISTER_MEM or
+ * PIPE_CONTROL.
+ */
+ uint64_t timestamp;
+
+ /* Timestamp written by COMPUTE_WALKER::PostSync
+ *
+ * Layout is described in PRMs.
+ * ATSM PRMs, Volume 2d: Command Reference: Structures, POSTSYNC_DATA:
+ *
+ * "The timestamp layout :
+ * [0] = 32b Context Timestamp Start
+ * [1] = 32b Global Timestamp Start
+ * [2] = 32b Context Timestamp End
+ * [3] = 32b Global Timestamp End"
+ */
+ uint32_t compute_walker[4];
+};
+
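+/* Count the command buffers with utrace points. For re-usable (non
+ * ONE_TIME_SUBMIT) command buffers, also count how many trace chunks need
+ * their timestamps copied into a submit-owned buffer, since a resubmission
+ * could otherwise overwrite them.
+ */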
+static uint32_t
+command_buffers_count_utraces(struct anv_device *device,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t *utrace_copies)
+{
+ if (!u_trace_should_process(&device->ds.trace_context))
+ return 0;
+
+ uint32_t utraces = 0;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ if (u_trace_has_points(&cmd_buffers[i]->trace)) {
+ utraces++;
+ if (!(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
+ *utrace_copies += list_length(&cmd_buffers[i]->trace.trace_chunks);
+ }
+ }
+
+ return utraces;
+}
+
+static void
+anv_utrace_delete_submit(struct u_trace_context *utctx, void *submit_data)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_utrace_submit *submit = submit_data;
+
+ intel_ds_flush_data_fini(&submit->ds);
+
+ anv_state_stream_finish(&submit->dynamic_state_stream);
+ anv_state_stream_finish(&submit->general_state_stream);
+
+ if (submit->trace_bo)
+ anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
+
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+ anv_bo_pool_free(&device->utrace_bo_pool, *bo);
+ util_dynarray_fini(&submit->batch_bos);
+
+ vk_sync_destroy(&device->vk, submit->sync);
+
+ vk_free(&device->vk.alloc, submit);
+}
+
+static void
+anv_device_utrace_emit_gfx_copy_ts_buffer(struct u_trace_context *utctx,
+ void *cmdstream,
+ void *ts_from, uint32_t from_offset,
+ void *ts_to, uint32_t to_offset,
+ uint32_t count)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_utrace_submit *submit = cmdstream;
+ struct anv_address from_addr = (struct anv_address) {
+ .bo = ts_from, .offset = from_offset * sizeof(union anv_utrace_timestamp) };
+ struct anv_address to_addr = (struct anv_address) {
+ .bo = ts_to, .offset = to_offset * sizeof(union anv_utrace_timestamp) };
+
+ anv_genX(device->info, emit_so_memcpy)(&submit->memcpy_state,
+ to_addr, from_addr,
+ count * sizeof(union anv_utrace_timestamp));
+}
+
+static void
+anv_device_utrace_emit_cs_copy_ts_buffer(struct u_trace_context *utctx,
+ void *cmdstream,
+ void *ts_from, uint32_t from_offset,
+ void *ts_to, uint32_t to_offset,
+ uint32_t count)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_utrace_submit *submit = cmdstream;
+ struct anv_address from_addr = (struct anv_address) {
+ .bo = ts_from, .offset = from_offset * sizeof(union anv_utrace_timestamp) };
+ struct anv_address to_addr = (struct anv_address) {
+ .bo = ts_to, .offset = to_offset * sizeof(union anv_utrace_timestamp) };
+
+ struct anv_state push_data_state =
+ anv_genX(device->info, simple_shader_alloc_push)(
+ &submit->simple_state, sizeof(struct anv_memcpy_params));
+ struct anv_memcpy_params *params = push_data_state.map;
+
+ *params = (struct anv_memcpy_params) {
+ .num_dwords = count * sizeof(union anv_utrace_timestamp) / 4,
+ .src_addr = anv_address_physical(from_addr),
+ .dst_addr = anv_address_physical(to_addr),
+ };
+
+ anv_genX(device->info, emit_simple_shader_dispatch)(
+ &submit->simple_state, DIV_ROUND_UP(params->num_dwords, 4),
+ push_data_state);
+}
+
+static VkResult
+anv_utrace_submit_extend_batch(struct anv_batch *batch, uint32_t size,
+ void *user_data)
+{
+ struct anv_utrace_submit *submit = user_data;
+
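+ /* Size the new BO at twice the total allocated so far, with an 8KiB
+ * minimum.
+ */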
+ uint32_t alloc_size = 0;
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+ alloc_size += (*bo)->size;
+ alloc_size = MAX2(alloc_size * 2, 8192);
+
+ struct anv_bo *bo;
+ VkResult result = anv_bo_pool_alloc(&submit->queue->device->utrace_bo_pool,
+ align(alloc_size, 4096),
+ &bo);
+ if (result != VK_SUCCESS)
+ return result;
+
+ util_dynarray_append(&submit->batch_bos, struct anv_bo *, bo);
+
+ batch->end += 4 * GFX9_MI_BATCH_BUFFER_START_length;
+
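+ /* Un-reserve the space kept for the chaining command, emit an
+ * MI_BATCH_BUFFER_START jumping into the new BO, then point the batch at
+ * it, again keeping room at its end for the next chain.
+ */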
+ anv_batch_emit(batch, GFX9_MI_BATCH_BUFFER_START, bbs) {
+ bbs.DWordLength = GFX9_MI_BATCH_BUFFER_START_length -
+ GFX9_MI_BATCH_BUFFER_START_length_bias;
+ bbs.SecondLevelBatchBuffer = Firstlevelbatch;
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress = (struct anv_address) { bo, 0 };
+ }
+
+ anv_batch_set_storage(batch,
+ (struct anv_address) { .bo = bo, },
+ bo->map,
+ bo->size - 4 * GFX9_MI_BATCH_BUFFER_START_length);
+
+ return VK_SUCCESS;
+}
+
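+/* Gather utrace data for a submission. For re-usable command buffers this
+ * builds a small batch copying their timestamps into a per-submit buffer,
+ * using the streamout memcpy on render engines and the internal memcpy
+ * compute kernel otherwise.
+ */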
+VkResult
+anv_device_utrace_flush_cmd_buffers(struct anv_queue *queue,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ struct anv_utrace_submit **out_submit)
+{
+ struct anv_device *device = queue->device;
+ uint32_t utrace_copies = 0;
+ uint32_t utraces = command_buffers_count_utraces(device,
+ cmd_buffer_count,
+ cmd_buffers,
+ &utrace_copies);
+ if (!utraces) {
+ *out_submit = NULL;
+ return VK_SUCCESS;
+ }
+
+ VkResult result;
+ struct anv_utrace_submit *submit =
+ vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!submit)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ submit->queue = queue;
+
+ intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);
+
+ result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
+ 0, 0, &submit->sync);
+ if (result != VK_SUCCESS)
+ goto error_sync;
+
+ util_dynarray_init(&submit->batch_bos, NULL);
+
+ if (utrace_copies > 0) {
+ result = anv_bo_pool_alloc(&device->utrace_bo_pool,
+ utrace_copies * 4096,
+ &submit->trace_bo);
+ if (result != VK_SUCCESS)
+ goto error_trace_buf;
+
+ const bool uses_relocs = device->physical->uses_relocs;
+ result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
+ if (result != VK_SUCCESS)
+ goto error_reloc_list;
+
+ anv_state_stream_init(&submit->dynamic_state_stream,
+ &device->dynamic_state_pool, 16384);
+ anv_state_stream_init(&submit->general_state_stream,
+ &device->general_state_pool, 16384);
+
+ submit->batch = (struct anv_batch) {
+ .alloc = &device->vk.alloc,
+ .relocs = &submit->relocs,
+ .user_data = submit,
+ .extend_cb = anv_utrace_submit_extend_batch,
+ };
+
+ /* Only engine classes where we support timestamp copies
+ *
+ * TODO: add INTEL_ENGINE_CLASS_COPY support (should be trivial ;)
+ */
+ assert(queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER ||
+ queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE);
+ if (queue->family->engine_class == INTEL_ENGINE_CLASS_RENDER) {
+
+ trace_intel_begin_trace_copy_cb(&submit->ds.trace, &submit->batch);
+
+ anv_genX(device->info, emit_so_memcpy_init)(&submit->memcpy_state,
+ device,
+ &submit->batch);
+ uint32_t num_traces = 0;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
+ intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
+ &submit->ds, false);
+ } else {
+ num_traces += cmd_buffers[i]->trace.num_traces;
+ u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
+ u_trace_end_iterator(&cmd_buffers[i]->trace),
+ &submit->ds.trace,
+ submit,
+ anv_device_utrace_emit_gfx_copy_ts_buffer);
+ }
+ }
+ anv_genX(device->info, emit_so_memcpy_fini)(&submit->memcpy_state);
+
+ trace_intel_end_trace_copy_cb(&submit->ds.trace, &submit->batch,
+ num_traces);
+
+ anv_genX(device->info, emit_so_memcpy_end)(&submit->memcpy_state);
+ } else {
+ struct anv_shader_bin *copy_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(device,
+ ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE,
+ &copy_kernel);
+ if (ret != VK_SUCCESS)
+ goto error_batch;
+
+ trace_intel_begin_trace_copy_cb(&submit->ds.trace, &submit->batch);
+
+ submit->simple_state = (struct anv_simple_shader) {
+ .device = device,
+ .dynamic_state_stream = &submit->dynamic_state_stream,
+ .general_state_stream = &submit->general_state_stream,
+ .batch = &submit->batch,
+ .kernel = copy_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ };
+ anv_genX(device->info, emit_simple_shader_init)(&submit->simple_state);
+
+ uint32_t num_traces = 0;
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ if (cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) {
+ intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
+ &submit->ds, false);
+ } else {
+ num_traces += cmd_buffers[i]->trace.num_traces;
+ u_trace_clone_append(u_trace_begin_iterator(&cmd_buffers[i]->trace),
+ u_trace_end_iterator(&cmd_buffers[i]->trace),
+ &submit->ds.trace,
+ submit,
+ anv_device_utrace_emit_cs_copy_ts_buffer);
+ }
+ }
+
+ trace_intel_end_trace_copy_cb(&submit->ds.trace, &submit->batch,
+ num_traces);
+
+ anv_genX(device->info, emit_simple_shader_end)(&submit->simple_state);
+ }
+
+ intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds, true);
+
+ if (submit->batch.status != VK_SUCCESS) {
+ result = submit->batch.status;
+ goto error_batch;
+ }
+ } else {
+ for (uint32_t i = 0; i < cmd_buffer_count; i++) {
+ assert(cmd_buffers[i]->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
+ intel_ds_queue_flush_data(&queue->ds, &cmd_buffers[i]->trace,
+ &submit->ds, i == (cmd_buffer_count - 1));
+ }
+ }
+
+ *out_submit = submit;
+
+ return VK_SUCCESS;
+
+ error_batch:
+ anv_reloc_list_finish(&submit->relocs);
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+ anv_bo_pool_free(&device->utrace_bo_pool, *bo);
+ error_reloc_list:
+ anv_bo_pool_free(&device->utrace_bo_pool, submit->trace_bo);
+ error_trace_buf:
+ vk_sync_destroy(&device->vk, submit->sync);
+ error_sync:
+ intel_ds_flush_data_fini(&submit->ds);
+ vk_free(&device->vk.alloc, submit);
+ return result;
+}
+
+static void *
+anv_utrace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size_b)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+
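+ /* u_trace sizes its buffers for plain 64-bit timestamps; scale that up to
+ * the size of our timestamp union.
+ */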
+ uint32_t anv_ts_size_b = (size_b / sizeof(uint64_t)) *
+ sizeof(union anv_utrace_timestamp);
+
+ struct anv_bo *bo = NULL;
+ UNUSED VkResult result =
+ anv_bo_pool_alloc(&device->utrace_bo_pool,
+ align(anv_ts_size_b, 4096),
+ &bo);
+ assert(result == VK_SUCCESS);
+
+ memset(bo->map, 0, bo->size);
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(bo->alloc_flags))
+ intel_flush_range(bo->map, bo->size);
+#endif
+
+ return bo;
+}
+
+static void
+anv_utrace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_bo *bo = timestamps;
+
+ anv_bo_pool_free(&device->utrace_bo_pool, bo);
+}
+
+static void
+anv_utrace_record_ts(struct u_trace *ut, void *cs,
+ void *timestamps, unsigned idx,
+ bool end_of_pipe)
+{
+ struct anv_device *device =
+ container_of(ut->utctx, struct anv_device, ds.trace_context);
+ struct anv_cmd_buffer *cmd_buffer =
+ container_of(ut, struct anv_cmd_buffer, trace);
+ /* cmd_buffer is only valid if cs == NULL */
+ struct anv_batch *batch = cs != NULL ? cs : &cmd_buffer->batch;
+ struct anv_bo *bo = timestamps;
+
+ struct anv_address ts_address = (struct anv_address) {
+ .bo = bo,
+ .offset = idx * sizeof(union anv_utrace_timestamp)
+ };
+
+ /* Is this an end-of-compute trace point? */
+ const bool is_end_compute =
+ cs == NULL &&
+ (cmd_buffer->last_compute_walker != NULL ||
+ cmd_buffer->last_indirect_dispatch != NULL) &&
+ end_of_pipe;
+
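+ /* End-of-pipe compute timestamps are captured by rewriting the post-sync
+ * of the last (indirect) dispatch instead of emitting a new capture;
+ * everything else uses a regular top/end-of-pipe capture.
+ */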
+ enum anv_timestamp_capture_type capture_type = end_of_pipe ?
+ (is_end_compute ?
+ (cmd_buffer->last_indirect_dispatch != NULL ?
+ ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH : ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER) :
+ ANV_TIMESTAMP_CAPTURE_END_OF_PIPE) : ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE;
+
+ void *addr = capture_type == ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH ?
+ cmd_buffer->last_indirect_dispatch :
+ capture_type == ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER ?
+ cmd_buffer->last_compute_walker : NULL;
+
+ device->physical->cmd_emit_timestamp(batch, device, ts_address,
+ capture_type,
+ addr);
+ if (is_end_compute) {
+ cmd_buffer->last_compute_walker = NULL;
+ cmd_buffer->last_indirect_dispatch = NULL;
+ }
+}
+
+static uint64_t
+anv_utrace_read_ts(struct u_trace_context *utctx,
+ void *timestamps, unsigned idx, void *flush_data)
+{
+ struct anv_device *device =
+ container_of(utctx, struct anv_device, ds.trace_context);
+ struct anv_bo *bo = timestamps;
+ struct anv_utrace_submit *submit = flush_data;
+
+ /* Only need to stall on results for the first entry: */
+ if (idx == 0) {
+ MESA_TRACE_SCOPE("anv utrace wait timestamps");
+ UNUSED VkResult result =
+ vk_sync_wait(&device->vk,
+ submit->sync,
+ 0,
+ VK_SYNC_WAIT_COMPLETE,
+ os_time_get_absolute_timeout(OS_TIMEOUT_INFINITE));
+ assert(result == VK_SUCCESS);
+ }
+
+ union anv_utrace_timestamp *ts = (union anv_utrace_timestamp *)bo->map;
+
+ /* Don't translate the no-timestamp marker: */
+ if (ts[idx].timestamp == U_TRACE_NO_TIMESTAMP)
+ return U_TRACE_NO_TIMESTAMP;
+
+ /* Detect a 16-byte COMPUTE_WALKER post-sync timestamp write */
+ if (ts[idx].compute_walker[2] != 0 || ts[idx].compute_walker[3] != 0) {
+ /* The timestamp written by COMPUTE_WALKER::PostSync is only 32 bits. We
+ * need to rebuild the full 64 bits using the previous timestamp. We
+ * assume that utrace reads the timestamps in order. Anyway a 32-bit
+ * timestamp only rolls over every few minutes, so in most cases the
+ * reconstruction should be correct.
+ */
+ uint64_t timestamp =
+ (submit->last_full_timestamp & 0xffffffff00000000) |
+ (uint64_t) ts[idx].compute_walker[3];
+
+ return intel_device_info_timebase_scale(device->info, timestamp);
+ }
+
+ submit->last_full_timestamp = ts[idx].timestamp;
+
+ return intel_device_info_timebase_scale(device->info, ts[idx].timestamp);
+}
+
+void
+anv_device_utrace_init(struct anv_device *device)
+{
+ anv_bo_pool_init(&device->utrace_bo_pool, device, "utrace",
+ ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_HOST_CACHED_COHERENT);
+ intel_ds_device_init(&device->ds, device->info, device->fd,
+ device->physical->local_minor,
+ INTEL_DS_API_VULKAN);
+ u_trace_context_init(&device->ds.trace_context,
+ &device->ds,
+ anv_utrace_create_ts_buffer,
+ anv_utrace_destroy_ts_buffer,
+ anv_utrace_record_ts,
+ anv_utrace_read_ts,
+ anv_utrace_delete_submit);
+
+ for (uint32_t q = 0; q < device->queue_count; q++) {
+ struct anv_queue *queue = &device->queues[q];
+
+ intel_ds_device_init_queue(&device->ds, &queue->ds, "%s%u",
+ intel_engines_class_to_string(queue->family->engine_class),
+ queue->vk.index_in_family);
+ }
+}
+
+void
+anv_device_utrace_finish(struct anv_device *device)
+{
+ intel_ds_device_process(&device->ds, true);
+ intel_ds_device_fini(&device->ds);
+ anv_bo_pool_finish(&device->utrace_bo_pool);
+}
+
+enum intel_ds_stall_flag
+anv_pipe_flush_bit_to_ds_stall_flag(enum anv_pipe_bits bits)
+{
+ static const struct {
+ enum anv_pipe_bits anv;
+ enum intel_ds_stall_flag ds;
+ } anv_to_ds_flags[] = {
+ { .anv = ANV_PIPE_DEPTH_CACHE_FLUSH_BIT, .ds = INTEL_DS_DEPTH_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_DATA_CACHE_FLUSH_BIT, .ds = INTEL_DS_DATA_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_TILE_CACHE_FLUSH_BIT, .ds = INTEL_DS_TILE_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT, .ds = INTEL_DS_RENDER_TARGET_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_STATE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_STATE_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_CONST_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_VF_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_VF_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_TEXTURE_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT, .ds = INTEL_DS_INST_CACHE_INVALIDATE_BIT, },
+ { .anv = ANV_PIPE_DEPTH_STALL_BIT, .ds = INTEL_DS_DEPTH_STALL_BIT, },
+ { .anv = ANV_PIPE_CS_STALL_BIT, .ds = INTEL_DS_CS_STALL_BIT, },
+ { .anv = ANV_PIPE_HDC_PIPELINE_FLUSH_BIT, .ds = INTEL_DS_HDC_PIPELINE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_STALL_AT_SCOREBOARD_BIT, .ds = INTEL_DS_STALL_AT_SCOREBOARD_BIT, },
+ { .anv = ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, .ds = INTEL_DS_UNTYPED_DATAPORT_CACHE_FLUSH_BIT, },
+ { .anv = ANV_PIPE_PSS_STALL_SYNC_BIT, .ds = INTEL_DS_PSS_STALL_SYNC_BIT, },
+ { .anv = ANV_PIPE_END_OF_PIPE_SYNC_BIT, .ds = INTEL_DS_END_OF_PIPE_BIT, },
+ { .anv = ANV_PIPE_CCS_CACHE_FLUSH_BIT, .ds = INTEL_DS_CCS_CACHE_FLUSH_BIT, },
+ };
+
+ enum intel_ds_stall_flag ret = 0;
+ for (uint32_t i = 0; i < ARRAY_SIZE(anv_to_ds_flags); i++) {
+ if (anv_to_ds_flags[i].anv & bits)
+ ret |= anv_to_ds_flags[i].ds;
+ }
+
+ return ret;
+}
+
+void anv_CmdBeginDebugUtilsLabelEXT(
+ VkCommandBuffer _commandBuffer,
+ const VkDebugUtilsLabelEXT *pLabelInfo)
+{
+ VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);
+
+ vk_common_CmdBeginDebugUtilsLabelEXT(_commandBuffer, pLabelInfo);
+
+ trace_intel_begin_cmd_buffer_annotation(&cmd_buffer->trace);
+}
+
+void anv_CmdEndDebugUtilsLabelEXT(VkCommandBuffer _commandBuffer)
+{
+ VK_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, _commandBuffer);
+
+ if (cmd_buffer->vk.labels.size > 0) {
+ const VkDebugUtilsLabelEXT *label =
+ util_dynarray_top_ptr(&cmd_buffer->vk.labels, VkDebugUtilsLabelEXT);
+
+ trace_intel_end_cmd_buffer_annotation(&cmd_buffer->trace,
+ strlen(label->pLabelName),
+ label->pLabelName);
+ }
+
+ vk_common_CmdEndDebugUtilsLabelEXT(_commandBuffer);
+}
+
+void
+anv_queue_trace(struct anv_queue *queue, const char *label, bool frame, bool begin)
+{
+ struct anv_device *device = queue->device;
+
+ VkResult result;
+ struct anv_utrace_submit *submit =
+ vk_zalloc(&device->vk.alloc, sizeof(struct anv_utrace_submit),
+ 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!submit)
+ return;
+
+ submit->queue = queue;
+
+ intel_ds_flush_data_init(&submit->ds, &queue->ds, queue->ds.submission_id);
+
+ result = vk_sync_create(&device->vk, &device->physical->sync_syncobj_type,
+ 0, 0, &submit->sync);
+ if (result != VK_SUCCESS)
+ goto error_trace;
+
+ const bool uses_relocs = device->physical->uses_relocs;
+ result = anv_reloc_list_init(&submit->relocs, &device->vk.alloc, uses_relocs);
+ if (result != VK_SUCCESS)
+ goto error_sync;
+
+ submit->batch = (struct anv_batch) {
+ .alloc = &device->vk.alloc,
+ .relocs = &submit->relocs,
+ .user_data = submit,
+ .extend_cb = anv_utrace_submit_extend_batch,
+ };
+
+ if (frame) {
+ if (begin)
+ trace_intel_begin_frame(&submit->ds.trace, &submit->batch);
+ else
+ trace_intel_end_frame(&submit->ds.trace, &submit->batch,
+ device->debug_frame_desc->frame_id);
+ } else {
+ if (begin) {
+ trace_intel_begin_queue_annotation(&submit->ds.trace, &submit->batch);
+ } else {
+ trace_intel_end_queue_annotation(&submit->ds.trace,
+ &submit->batch,
+ strlen(label),
+ label);
+ }
+ }
+
+ anv_batch_emit(&submit->batch, GFX9_MI_BATCH_BUFFER_END, bbs);
+ anv_batch_emit(&submit->batch, GFX9_MI_NOOP, noop);
+
+ if (submit->batch.status != VK_SUCCESS) {
+ result = submit->batch.status;
+ goto error_reloc_list;
+ }
+
+ intel_ds_queue_flush_data(&queue->ds, &submit->ds.trace, &submit->ds, true);
+
+ pthread_mutex_lock(&device->mutex);
+ device->kmd_backend->queue_exec_trace(queue, submit);
+ pthread_mutex_unlock(&device->mutex);
+
+ return;
+
+ error_reloc_list:
+ anv_reloc_list_finish(&submit->relocs);
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, bo)
+ anv_bo_pool_free(&device->utrace_bo_pool, *bo);
+ error_sync:
+ vk_sync_destroy(&device->vk, submit->sync);
+ error_trace:
+ intel_ds_flush_data_fini(&submit->ds);
+ vk_free(&device->vk.alloc, submit);
+}
+
+void
+anv_QueueBeginDebugUtilsLabelEXT(
+ VkQueue _queue,
+ const VkDebugUtilsLabelEXT *pLabelInfo)
+{
+ VK_FROM_HANDLE(anv_queue, queue, _queue);
+
+ vk_common_QueueBeginDebugUtilsLabelEXT(_queue, pLabelInfo);
+
+ anv_queue_trace(queue, pLabelInfo->pLabelName,
+ false /* frame */, true /* begin */);
+}
+
+void
+anv_QueueEndDebugUtilsLabelEXT(VkQueue _queue)
+{
+ VK_FROM_HANDLE(anv_queue, queue, _queue);
+
+ if (queue->vk.labels.size > 0) {
+ const VkDebugUtilsLabelEXT *label =
+ util_dynarray_top_ptr(&queue->vk.labels, VkDebugUtilsLabelEXT);
+ anv_queue_trace(queue, label->pLabelName,
+ false /* frame */, false /* begin */);
+
+ intel_ds_device_process(&queue->device->ds, true);
+ }
+
+ vk_common_QueueEndDebugUtilsLabelEXT(_queue);
+}
diff --git a/src/intel/vulkan/anv_va.c b/src/intel/vulkan/anv_va.c
new file mode 100644
index 00000000000..fe05342a7f6
--- /dev/null
+++ b/src/intel/vulkan/anv_va.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "util/u_math.h"
+
+static uint64_t
+va_add(struct anv_va_range *range, uint64_t addr, uint64_t size)
+{
+ range->addr = addr;
+ range->size = size;
+
+ return addr + size;
+}
+
+static void
+va_at(struct anv_va_range *range, uint64_t addr, uint64_t size)
+{
+ range->addr = addr;
+ range->size = size;
+}
+
+static void
+anv_device_print_vas(struct anv_physical_device *device)
+{
+ fprintf(stderr, "Driver heaps:\n");
+#define PRINT_HEAP(name) \
+ fprintf(stderr, " 0x%016"PRIx64"-0x%016"PRIx64": %s\n", \
+ device->va.name.addr, \
+ device->va.name.addr + device->va.name.size, \
+ #name);
+ PRINT_HEAP(general_state_pool);
+ PRINT_HEAP(low_heap);
+ PRINT_HEAP(dynamic_state_pool);
+ PRINT_HEAP(sampler_state_pool);
+ PRINT_HEAP(binding_table_pool);
+ PRINT_HEAP(internal_surface_state_pool);
+ PRINT_HEAP(scratch_surface_state_pool);
+ PRINT_HEAP(bindless_surface_state_pool);
+ PRINT_HEAP(indirect_descriptor_pool);
+ PRINT_HEAP(indirect_push_descriptor_pool);
+ PRINT_HEAP(instruction_state_pool);
+ PRINT_HEAP(dynamic_state_db_pool);
+ PRINT_HEAP(descriptor_buffer_pool);
+ PRINT_HEAP(push_descriptor_buffer_pool);
+ PRINT_HEAP(high_heap);
+ PRINT_HEAP(trtt);
+}
+
+void
+anv_physical_device_init_va_ranges(struct anv_physical_device *device)
+{
+ /* anv Virtual Memory Layout
+ * =========================
+ *
+ * When the anv driver is determining the virtual graphics addresses of
+ * memory objects itself using the softpin mechanism, the following memory
+ * ranges will be used.
+ *
+ * Three special considerations to notice:
+ *
+ * (1) the dynamic state pool is located within the same 4 GiB as the low
+ * heap. This is to work around a VF cache issue described in a comment in
+ * anv_physical_device_init_heaps.
+ *
+ * (2) the binding table pool is located at lower addresses than the BT
+ * (binding table) surface state pool, within a 4 GiB range which also
+ * contains the bindless surface state pool. This allows surface state base
+ * addresses to cover both binding tables (16 bit offsets), the internal
+ * surface states (32 bit offsets) and the bindless surface states.
+ *
+ * (3) the last 4 GiB of the address space is withheld from the high heap.
+ * Various hardware units will read past the end of an object for various
+ * reasons. This healthy margin prevents reads from wrapping around 48-bit
+ * addresses.
+ */
+ uint64_t _1Mb = 1ull * 1024 * 1024;
+ uint64_t _1Gb = 1ull * 1024 * 1024 * 1024;
+ uint64_t _4Gb = 4ull * 1024 * 1024 * 1024;
+
+ uint64_t address = 0x000000200000ULL; /* 2MiB */
+
+ address = va_add(&device->va.general_state_pool, address,
+ _1Gb - address);
+
+ address = va_add(&device->va.low_heap, address, _1Gb);
+
+ /* The binding table pool has to be located directly in front of the
+ * surface states.
+ */
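+ /* The extra 1Gb gap keeps the internal surface state pool 2Gb-aligned
+ * (see the assert below) while the binding table pool stays directly in
+ * front of it.
+ */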
+ address += _1Gb;
+ address = va_add(&device->va.binding_table_pool, address, _1Gb);
+ address = va_add(&device->va.internal_surface_state_pool, address, 1 * _1Gb);
+ assert(device->va.internal_surface_state_pool.addr ==
+ align64(device->va.internal_surface_state_pool.addr, 2 * _1Gb));
+ /* Scratch surface state overlaps with the internal surface state */
+ va_at(&device->va.scratch_surface_state_pool,
+ device->va.internal_surface_state_pool.addr,
+ 8 * _1Mb);
+ address = va_add(&device->va.bindless_surface_state_pool, address, 2 * _1Gb);
+
+
+ /* PRMs & simulation disagree on the actual size of this heap. Take the
+ * smallest (simulation) so that it works everywhere.
+ */
+ address = align64(address, _4Gb);
+ address = va_add(&device->va.dynamic_state_pool, address, _1Gb);
+ address = va_add(&device->va.sampler_state_pool, address, 2 * _1Gb);
+
+ if (device->indirect_descriptors) {
+ /* With indirect descriptors, descriptor buffers can go anywhere, they
+ * just need to be in a 4Gb aligned range, so all shader accesses can
+ * use a relocatable upper dword for the 64bit address.
+ */
+ address = align64(address, _4Gb);
+ address = va_add(&device->va.indirect_descriptor_pool, address, 3 * _1Gb);
+ address = va_add(&device->va.indirect_push_descriptor_pool, address, _1Gb);
+ }
+
+ /* We use a trick to compute constant data offsets in the shaders to avoid
+ * unnecessary 64bit address computations (see lower_load_constant() in
+ * anv_nir_apply_pipeline_layout.c). This assumes the instruction pool is
+ * located at an address with the lower 32bits at 0.
+ */
+ address = align64(address, _4Gb);
+ address = va_add(&device->va.instruction_state_pool, address, 2 * _1Gb);
+
+ address += 1 * _1Gb;
+ address = va_add(&device->va.dynamic_state_db_pool, address, _1Gb);
+ address = va_add(&device->va.descriptor_buffer_pool, address, 2 *_1Gb);
+ assert(device->va.descriptor_buffer_pool.addr % _4Gb == 0);
+ if (device->info.verx10 >= 125)
+ address = va_add(&device->va.push_descriptor_buffer_pool, address, _1Gb - 4096);
+
+ assert(device->va.descriptor_buffer_pool.addr ==
+ align64(device->va.descriptor_buffer_pool.addr, 4 * _1Gb));
+
+ address = align64(address, device->info.mem_alignment);
+ address = va_add(&device->va.aux_tt_pool, address, 2 * _1Gb);
+
+ /* What's left to do for us is to set va.high_heap and va.trtt without
+ * overlap, but there are a few things to be considered:
+ *
+ * The TR-TT address space is governed by the GFX_TRTT_VA_RANGE register,
+ * which carves out part of the address space for TR-TT and is independent
+ * of device->gtt_size. We use 47:44 for gen9+, the values we set here
+ * should be in sync with what we write to the register.
+ *
+ * If we ever gain the capability to use more than 48 bits of address space
+ * we'll have to adjust where we put the TR-TT space (and how we set
+ * GFX_TRTT_VA_RANGE).
+ *
+ * We have to leave the last 4GiB out of the high vma range, so that no
+ * state base address + size can overflow 48 bits. For more information see
+ * the comment about Wa32bitGeneralStateOffset in anv_allocator.c
+ *
+ * Despite the comment above, before we had TR-TT we were not only avoiding
+ * the last 4GiB of the 48bit address space, but also avoiding the last
+ * 4GiB of gtt_size, so let's be on the safe side and keep that 4GiB
+ * margin below both the TR-TT space top and the gtt top.
+ */
+ assert(device->gtt_size <= (1uLL << 48));
+ uint64_t trtt_start = 0xFuLL << 44;
+ uint64_t trtt_end = (1uLL << 48) - 4 * _1Gb;
+ uint64_t addressable_top = MIN2(device->gtt_size, trtt_start) - 4 * _1Gb;
+
+ uint64_t user_heaps_size = addressable_top - address;
+ address = va_add(&device->va.high_heap, address, user_heaps_size);
+ assert(address <= trtt_start);
+ address = va_add(&device->va.trtt, trtt_start, trtt_end - trtt_start);
+
+ if (INTEL_DEBUG(DEBUG_HEAPS))
+ anv_device_print_vas(device);
+}
diff --git a/src/intel/vulkan/anv_video.c b/src/intel/vulkan/anv_video.c
new file mode 100644
index 00000000000..070c1806cc3
--- /dev/null
+++ b/src/intel/vulkan/anv_video.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright © 2021 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "vk_video/vulkan_video_codecs_common.h"
+
+VkResult
+anv_CreateVideoSessionKHR(VkDevice _device,
+ const VkVideoSessionCreateInfoKHR *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkVideoSessionKHR *pVideoSession)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ struct anv_video_session *vid =
+ vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*vid), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!vid)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ memset(vid, 0, sizeof(struct anv_video_session));
+
+ VkResult result = vk_video_session_init(&device->vk,
+ &vid->vk,
+ pCreateInfo);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, vid);
+ return result;
+ }
+
+ *pVideoSession = anv_video_session_to_handle(vid);
+ return VK_SUCCESS;
+}
+
+void
+anv_DestroyVideoSessionKHR(VkDevice _device,
+ VkVideoSessionKHR _session,
+ const VkAllocationCallbacks *pAllocator)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_video_session, vid, _session);
+ if (!_session)
+ return;
+
+ vk_object_base_finish(&vid->vk.base);
+ vk_free2(&device->vk.alloc, pAllocator, vid);
+}
+
+VkResult
+anv_CreateVideoSessionParametersKHR(VkDevice _device,
+ const VkVideoSessionParametersCreateInfoKHR *pCreateInfo,
+ const VkAllocationCallbacks *pAllocator,
+ VkVideoSessionParametersKHR *pVideoSessionParameters)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_video_session, vid, pCreateInfo->videoSession);
+ ANV_FROM_HANDLE(anv_video_session_params, templ, pCreateInfo->videoSessionParametersTemplate);
+ struct anv_video_session_params *params =
+ vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*params), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+ if (!params)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ VkResult result = vk_video_session_parameters_init(&device->vk,
+ &params->vk,
+ &vid->vk,
+ templ ? &templ->vk : NULL,
+ pCreateInfo);
+ if (result != VK_SUCCESS) {
+ vk_free2(&device->vk.alloc, pAllocator, params);
+ return result;
+ }
+
+ *pVideoSessionParameters = anv_video_session_params_to_handle(params);
+ return VK_SUCCESS;
+}
+
+void
+anv_DestroyVideoSessionParametersKHR(VkDevice _device,
+ VkVideoSessionParametersKHR _params,
+ const VkAllocationCallbacks *pAllocator)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_video_session_params, params, _params);
+ if (!_params)
+ return;
+ vk_video_session_parameters_finish(&device->vk, &params->vk);
+ vk_free2(&device->vk.alloc, pAllocator, params);
+}
+
+VkResult
+anv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice,
+ const VkVideoProfileInfoKHR *pVideoProfile,
+ VkVideoCapabilitiesKHR *pCapabilities)
+{
+ ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+ pCapabilities->minBitstreamBufferOffsetAlignment = 32;
+ pCapabilities->minBitstreamBufferSizeAlignment = 32;
+ pCapabilities->maxCodedExtent.width = 4096;
+ pCapabilities->maxCodedExtent.height = 4096;
+ pCapabilities->flags = VK_VIDEO_CAPABILITY_SEPARATE_REFERENCE_IMAGES_BIT_KHR;
+
+ struct VkVideoDecodeCapabilitiesKHR *dec_caps = (struct VkVideoDecodeCapabilitiesKHR *)
+ vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_CAPABILITIES_KHR);
+ if (dec_caps)
+ dec_caps->flags = VK_VIDEO_DECODE_CAPABILITY_DPB_AND_OUTPUT_COINCIDE_BIT_KHR;
+
+ /* H264 allows different luma and chroma bit depths, but we only support
+ * them being equal. */
+ if (pVideoProfile->lumaBitDepth != pVideoProfile->chromaBitDepth)
+ return VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR;
+
+ if (pVideoProfile->chromaSubsampling != VK_VIDEO_CHROMA_SUBSAMPLING_420_BIT_KHR)
+ return VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR;
+
+ switch (pVideoProfile->videoCodecOperation) {
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR: {
+ struct VkVideoDecodeH264CapabilitiesKHR *ext = (struct VkVideoDecodeH264CapabilitiesKHR *)
+ vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H264_CAPABILITIES_KHR);
+
+ if (pVideoProfile->lumaBitDepth != VK_VIDEO_COMPONENT_BIT_DEPTH_8_BIT_KHR)
+ return VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR;
+
+ pCapabilities->maxDpbSlots = 17;
+ pCapabilities->maxActiveReferencePictures = ANV_VIDEO_H264_MAX_NUM_REF_FRAME;
+ pCapabilities->pictureAccessGranularity.width = ANV_MB_WIDTH;
+ pCapabilities->pictureAccessGranularity.height = ANV_MB_HEIGHT;
+ pCapabilities->minCodedExtent.width = ANV_MB_WIDTH;
+ pCapabilities->minCodedExtent.height = ANV_MB_HEIGHT;
+
+ ext->fieldOffsetGranularity.x = 0;
+ ext->fieldOffsetGranularity.y = 0;
+ ext->maxLevelIdc = STD_VIDEO_H264_LEVEL_IDC_5_1;
+ strcpy(pCapabilities->stdHeaderVersion.extensionName, VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_EXTENSION_NAME);
+ pCapabilities->stdHeaderVersion.specVersion = VK_STD_VULKAN_VIDEO_CODEC_H264_DECODE_SPEC_VERSION;
+ break;
+ }
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR: {
+ struct VkVideoDecodeH265CapabilitiesKHR *ext = (struct VkVideoDecodeH265CapabilitiesKHR *)
+ vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H265_CAPABILITIES_KHR);
+
+ const struct VkVideoDecodeH265ProfileInfoKHR *h265_profile =
+ vk_find_struct_const(pVideoProfile->pNext,
+ VIDEO_DECODE_H265_PROFILE_INFO_KHR);
+
+ /* No hardware supports the scc extension profile */
+ if (h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_10 &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_STILL_PICTURE &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_FORMAT_RANGE_EXTENSIONS)
+ return VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR;
+
+ /* Skylake only supports the main profile */
+ if (h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_STILL_PICTURE &&
+ pdevice->info.platform <= INTEL_PLATFORM_SKL)
+ return VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR;
+
+ /* Gfx10 and under don't support the range extension profile */
+ if (h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_10 &&
+ h265_profile->stdProfileIdc != STD_VIDEO_H265_PROFILE_IDC_MAIN_STILL_PICTURE &&
+ pdevice->info.ver <= 10)
+ return VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR;
+
+ if (pVideoProfile->lumaBitDepth != VK_VIDEO_COMPONENT_BIT_DEPTH_8_BIT_KHR &&
+ pVideoProfile->lumaBitDepth != VK_VIDEO_COMPONENT_BIT_DEPTH_10_BIT_KHR)
+ return VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR;
+
+ pCapabilities->pictureAccessGranularity.width = ANV_MAX_H265_CTB_SIZE;
+ pCapabilities->pictureAccessGranularity.height = ANV_MAX_H265_CTB_SIZE;
+ pCapabilities->minCodedExtent.width = ANV_MAX_H265_CTB_SIZE;
+ pCapabilities->minCodedExtent.height = ANV_MAX_H265_CTB_SIZE;
+ pCapabilities->maxDpbSlots = ANV_VIDEO_H265_MAX_NUM_REF_FRAME;
+ pCapabilities->maxActiveReferencePictures = ANV_VIDEO_H265_HCP_NUM_REF_FRAME;
+
+ ext->maxLevelIdc = STD_VIDEO_H265_LEVEL_IDC_6_2;
+
+ strcpy(pCapabilities->stdHeaderVersion.extensionName, VK_STD_VULKAN_VIDEO_CODEC_H265_DECODE_EXTENSION_NAME);
+ pCapabilities->stdHeaderVersion.specVersion = VK_STD_VULKAN_VIDEO_CODEC_H265_DECODE_SPEC_VERSION;
+ break;
+ }
+ default:
+ break;
+ }
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_GetPhysicalDeviceVideoFormatPropertiesKHR(VkPhysicalDevice physicalDevice,
+ const VkPhysicalDeviceVideoFormatInfoKHR *pVideoFormatInfo,
+ uint32_t *pVideoFormatPropertyCount,
+ VkVideoFormatPropertiesKHR *pVideoFormatProperties)
+{
+ VK_OUTARRAY_MAKE_TYPED(VkVideoFormatPropertiesKHR, out,
+ pVideoFormatProperties,
+ pVideoFormatPropertyCount);
+
+ bool need_10bit = false;
+ const struct VkVideoProfileListInfoKHR *prof_list = (struct VkVideoProfileListInfoKHR *)
+ vk_find_struct_const(pVideoFormatInfo->pNext, VIDEO_PROFILE_LIST_INFO_KHR);
+
+ if (prof_list) {
+ for (unsigned i = 0; i < prof_list->profileCount; i++) {
+ const VkVideoProfileInfoKHR *profile = &prof_list->pProfiles[i];
+ if (profile->lumaBitDepth & VK_VIDEO_COMPONENT_BIT_DEPTH_10_BIT_KHR ||
+ profile->chromaBitDepth & VK_VIDEO_COMPONENT_BIT_DEPTH_10_BIT_KHR)
+ need_10bit = true;
+ }
+ }
+
+ vk_outarray_append_typed(VkVideoFormatPropertiesKHR, &out, p) {
+ p->format = VK_FORMAT_G8_B8R8_2PLANE_420_UNORM;
+ p->imageType = VK_IMAGE_TYPE_2D;
+ p->imageTiling = VK_IMAGE_TILING_OPTIMAL;
+ p->imageUsageFlags = pVideoFormatInfo->imageUsage;
+ }
+
+ if (need_10bit) {
+ vk_outarray_append_typed(VkVideoFormatPropertiesKHR, &out, p) {
+ p->format = VK_FORMAT_G10X6_B10X6R10X6_2PLANE_420_UNORM_3PACK16;
+ p->imageType = VK_IMAGE_TYPE_2D;
+ p->imageTiling = VK_IMAGE_TILING_OPTIMAL;
+ p->imageUsageFlags = pVideoFormatInfo->imageUsage;
+ }
+ }
+
+ return vk_outarray_status(&out);
+}
+
+static uint64_t
+get_h264_video_mem_size(struct anv_video_session *vid, uint32_t mem_idx)
+{
+ uint32_t width_in_mb =
+ align(vid->vk.max_coded.width, ANV_MB_WIDTH) / ANV_MB_WIDTH;
+
+ switch (mem_idx) {
+ case ANV_VID_MEM_H264_INTRA_ROW_STORE:
+ return width_in_mb * 64;
+ case ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE:
+ return width_in_mb * 64 * 4;
+ case ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH:
+ return width_in_mb * 64 * 2;
+ case ANV_VID_MEM_H264_MPR_ROW_SCRATCH:
+ return width_in_mb * 64 * 2;
+ default:
+ unreachable("unknown memory");
+ }
+}
+
+static uint64_t
+get_h265_video_mem_size(struct anv_video_session *vid, uint32_t mem_idx)
+{
+ uint32_t bit_shift =
+ vid->vk.h265.profile_idc == STD_VIDEO_H265_PROFILE_IDC_MAIN_10 ? 2 : 3;
+
+ /* TODO: these sizes could be determined dynamically depending on the ctb sizes of each slice. */
+ uint32_t width_in_ctb =
+ align(vid->vk.max_coded.width, ANV_MAX_H265_CTB_SIZE) / ANV_MAX_H265_CTB_SIZE;
+ uint32_t height_in_ctb =
+ align(vid->vk.max_coded.height, ANV_MAX_H265_CTB_SIZE) / ANV_MAX_H265_CTB_SIZE;
+ uint64_t size;
+
+ switch (mem_idx) {
+ case ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE:
+ case ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_LINE:
+ size = align(vid->vk.max_coded.width, 32) >> bit_shift;
+ break;
+ case ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_COLUMN:
+ size = align(vid->vk.max_coded.height + 6 * height_in_ctb, 32) >> bit_shift;
+ break;
+ case ANV_VID_MEM_H265_METADATA_LINE:
+ size = (((vid->vk.max_coded.width + 15) >> 4) * 188 + width_in_ctb * 9 + 1023) >> 9;
+ break;
+ case ANV_VID_MEM_H265_METADATA_TILE_LINE:
+ size = (((vid->vk.max_coded.width + 15) >> 4) * 172 + width_in_ctb * 9 + 1023) >> 9;
+ break;
+ case ANV_VID_MEM_H265_METADATA_TILE_COLUMN:
+ size = (((vid->vk.max_coded.height + 15) >> 4) * 176 + height_in_ctb * 89 + 1023) >> 9;
+ break;
+ case ANV_VID_MEM_H265_SAO_LINE:
+ size = align((vid->vk.max_coded.width >> 1) + width_in_ctb * 3, 16) >> bit_shift;
+ break;
+ case ANV_VID_MEM_H265_SAO_TILE_LINE:
+ size = align((vid->vk.max_coded.width >> 1) + width_in_ctb * 6, 16) >> bit_shift;
+ break;
+ case ANV_VID_MEM_H265_SAO_TILE_COLUMN:
+ size = align((vid->vk.max_coded.height >> 1) + height_in_ctb * 6, 16) >> bit_shift;
+ break;
+ default:
+ unreachable("unknown memory");
+ }
+
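+ /* The sizes above are computed in 64-byte units; convert to bytes. */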
+ return size << 6;
+}
+
+static void
+get_h264_video_session_mem_reqs(struct anv_video_session *vid,
+ VkVideoSessionMemoryRequirementsKHR *mem_reqs,
+ uint32_t *pVideoSessionMemoryRequirementsCount,
+ uint32_t memory_types)
+{
+ VK_OUTARRAY_MAKE_TYPED(VkVideoSessionMemoryRequirementsKHR,
+ out,
+ mem_reqs,
+ pVideoSessionMemoryRequirementsCount);
+
+ for (unsigned i = 0; i < ANV_VIDEO_MEM_REQS_H264; i++) {
+ uint32_t bind_index = ANV_VID_MEM_H264_INTRA_ROW_STORE + i;
+ uint64_t size = get_h264_video_mem_size(vid, i);
+
+ vk_outarray_append_typed(VkVideoSessionMemoryRequirementsKHR, &out, p) {
+ p->memoryBindIndex = bind_index;
+ p->memoryRequirements.size = size;
+ p->memoryRequirements.alignment = 4096;
+ p->memoryRequirements.memoryTypeBits = memory_types;
+ }
+ }
+}
+
+static void
+get_h265_video_session_mem_reqs(struct anv_video_session *vid,
+ VkVideoSessionMemoryRequirementsKHR *mem_reqs,
+ uint32_t *pVideoSessionMemoryRequirementsCount,
+ uint32_t memory_types)
+{
+ VK_OUTARRAY_MAKE_TYPED(VkVideoSessionMemoryRequirementsKHR,
+ out,
+ mem_reqs,
+ pVideoSessionMemoryRequirementsCount);
+
+ for (unsigned i = 0; i < ANV_VIDEO_MEM_REQS_H265; i++) {
+ uint32_t bind_index =
+ ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE + i;
+ uint64_t size = get_h265_video_mem_size(vid, i);
+
+ vk_outarray_append_typed(VkVideoSessionMemoryRequirementsKHR, &out, p) {
+ p->memoryBindIndex = bind_index;
+ p->memoryRequirements.size = size;
+ p->memoryRequirements.alignment = 4096;
+ p->memoryRequirements.memoryTypeBits = memory_types;
+ }
+ }
+}
+
+VkResult
+anv_GetVideoSessionMemoryRequirementsKHR(VkDevice _device,
+ VkVideoSessionKHR videoSession,
+ uint32_t *pVideoSessionMemoryRequirementsCount,
+ VkVideoSessionMemoryRequirementsKHR *mem_reqs)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_video_session, vid, videoSession);
+
+ uint32_t memory_types =
+ (vid->vk.flags & VK_VIDEO_SESSION_CREATE_PROTECTED_CONTENT_BIT_KHR) ?
+ device->physical->memory.protected_mem_types :
+ device->physical->memory.default_buffer_mem_types;
+ switch (vid->vk.op) {
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+ get_h264_video_session_mem_reqs(vid,
+ mem_reqs,
+ pVideoSessionMemoryRequirementsCount,
+ memory_types);
+ break;
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+ get_h265_video_session_mem_reqs(vid,
+ mem_reqs,
+ pVideoSessionMemoryRequirementsCount,
+ memory_types);
+ break;
+ default:
+ unreachable("unknown codec");
+ }
+
+ return VK_SUCCESS;
+}
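
From the application's point of view, this entry point follows the usual Vulkan count/fill pattern. A minimal usage sketch (application-side code, not part of this change; dev and session are assumed to be a valid device and a previously created decode session):

   uint32_t count = 0;
   vkGetVideoSessionMemoryRequirementsKHR(dev, session, &count, NULL);

   VkVideoSessionMemoryRequirementsKHR reqs[16]; /* count is small for H.264/H.265 */
   assert(count <= 16);
   for (uint32_t i = 0; i < count; i++) {
      reqs[i] = (VkVideoSessionMemoryRequirementsKHR) {
         .sType = VK_STRUCTURE_TYPE_VIDEO_SESSION_MEMORY_REQUIREMENTS_KHR,
      };
   }
   vkGetVideoSessionMemoryRequirementsKHR(dev, session, &count, reqs);
   /* Each entry now carries a memoryBindIndex plus the size, 4096-byte
    * alignment and memoryTypeBits filled in above. */
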
+
+VkResult
+anv_UpdateVideoSessionParametersKHR(VkDevice _device,
+ VkVideoSessionParametersKHR _params,
+ const VkVideoSessionParametersUpdateInfoKHR *pUpdateInfo)
+{
+ ANV_FROM_HANDLE(anv_video_session_params, params, _params);
+ return vk_video_session_parameters_update(&params->vk, pUpdateInfo);
+}
+
+static void
+copy_bind(struct anv_vid_mem *dst,
+ const VkBindVideoSessionMemoryInfoKHR *src)
+{
+ dst->mem = anv_device_memory_from_handle(src->memory);
+ dst->offset = src->memoryOffset;
+ dst->size = src->memorySize;
+}
+
+VkResult
+anv_BindVideoSessionMemoryKHR(VkDevice _device,
+ VkVideoSessionKHR videoSession,
+ uint32_t bind_mem_count,
+ const VkBindVideoSessionMemoryInfoKHR *bind_mem)
+{
+ ANV_FROM_HANDLE(anv_video_session, vid, videoSession);
+
+ switch (vid->vk.op) {
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+ for (unsigned i = 0; i < bind_mem_count; i++) {
+ copy_bind(&vid->vid_mem[bind_mem[i].memoryBindIndex], &bind_mem[i]);
+ }
+ break;
+ default:
+ unreachable("unknown codec");
+ }
+ return VK_SUCCESS;
+}
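
Once VkDeviceMemory objects matching those requirements have been allocated, the bind indices are fed back through vkBindVideoSessionMemoryKHR, which the code above copies one-to-one into vid->vid_mem[]. A sketch continuing the previous example (mem[i] is an assumed VkDeviceMemory satisfying reqs[i]):

   VkBindVideoSessionMemoryInfoKHR binds[16];
   for (uint32_t i = 0; i < count; i++) {
      binds[i] = (VkBindVideoSessionMemoryInfoKHR) {
         .sType = VK_STRUCTURE_TYPE_BIND_VIDEO_SESSION_MEMORY_INFO_KHR,
         .memoryBindIndex = reqs[i].memoryBindIndex,
         .memory = mem[i],
         .memoryOffset = 0,
         .memorySize = reqs[i].memoryRequirements.size,
      };
   }
   vkBindVideoSessionMemoryKHR(dev, session, count, binds);
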
diff --git a/src/intel/vulkan/anv_wsi.c b/src/intel/vulkan/anv_wsi.c
index 04d85d99d67..ab8e5d5fc6c 100644
--- a/src/intel/vulkan/anv_wsi.c
+++ b/src/intel/vulkan/anv_wsi.c
@@ -24,6 +24,9 @@
#include "anv_private.h"
#include "anv_measure.h"
#include "wsi_common.h"
+#include "vk_fence.h"
+#include "vk_queue.h"
+#include "vk_semaphore.h"
#include "vk_util.h"
static PFN_vkVoidFunction
@@ -33,46 +36,17 @@ anv_wsi_proc_addr(VkPhysicalDevice physicalDevice, const char *pName)
return vk_instance_get_proc_addr_unchecked(&pdevice->instance->vk, pName);
}
-static void
-anv_wsi_signal_semaphore_for_memory(VkDevice _device,
- VkSemaphore _semaphore,
- VkDeviceMemory _memory)
+static VkQueue
+anv_wsi_get_prime_blit_queue(VkDevice _device)
{
ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);
- ANV_FROM_HANDLE(anv_device_memory, memory, _memory);
- /* Put a BO semaphore with the image BO in the temporary. For BO binary
- * semaphores, we always set EXEC_OBJECT_WRITE so this creates a WaR
- * hazard with the display engine's read to ensure that no one writes to
- * the image before the read is complete.
- */
- anv_semaphore_reset_temporary(device, semaphore);
-
- struct anv_semaphore_impl *impl = &semaphore->temporary;
- impl->type = ANV_SEMAPHORE_TYPE_WSI_BO;
- impl->bo = anv_bo_ref(memory->bo);
-}
-
-static void
-anv_wsi_signal_fence_for_memory(VkDevice _device,
- VkFence _fence,
- VkDeviceMemory _memory)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_fence, fence, _fence);
- ANV_FROM_HANDLE(anv_device_memory, memory, _memory);
-
- /* Put a BO fence with the image BO in the temporary. For BO fences, we
- * always just wait until the BO isn't busy and reads from the BO should
- * count as busy.
- */
- anv_fence_reset_temporary(device, fence);
-
- struct anv_fence_impl *impl = &fence->temporary;
- impl->type = ANV_FENCE_TYPE_WSI_BO;
- impl->bo.bo = anv_bo_ref(memory->bo);
- impl->bo.state = ANV_BO_FENCE_STATE_SUBMITTED;
+ vk_foreach_queue(_queue, &device->vk) {
+ struct anv_queue *queue = (struct anv_queue *)_queue;
+ if (queue->family->queueFlags & (VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT))
+ return vk_queue_to_handle(_queue);
+ }
+ return NULL;
}
VkResult
@@ -86,15 +60,21 @@ anv_init_wsi(struct anv_physical_device *physical_device)
&physical_device->instance->vk.alloc,
physical_device->master_fd,
&physical_device->instance->dri_options,
- false);
+ &(struct wsi_device_options){.sw_device = false});
if (result != VK_SUCCESS)
return result;
physical_device->wsi_device.supports_modifiers = true;
- physical_device->wsi_device.signal_semaphore_for_memory =
- anv_wsi_signal_semaphore_for_memory;
- physical_device->wsi_device.signal_fence_for_memory =
- anv_wsi_signal_fence_for_memory;
+ physical_device->wsi_device.get_blit_queue = anv_wsi_get_prime_blit_queue;
+ if (physical_device->info.kmd_type == INTEL_KMD_TYPE_I915) {
+ physical_device->wsi_device.signal_semaphore_with_memory = true;
+ physical_device->wsi_device.signal_fence_with_memory = true;
+ }
+
+ physical_device->vk.wsi_device = &physical_device->wsi_device;
+
+ wsi_device_setup_syncobj_fd(&physical_device->wsi_device,
+ physical_device->local_fd);
return VK_SUCCESS;
}
@@ -102,187 +82,25 @@ anv_init_wsi(struct anv_physical_device *physical_device)
void
anv_finish_wsi(struct anv_physical_device *physical_device)
{
+ physical_device->vk.wsi_device = NULL;
wsi_device_finish(&physical_device->wsi_device,
&physical_device->instance->vk.alloc);
}
-void anv_DestroySurfaceKHR(
- VkInstance _instance,
- VkSurfaceKHR _surface,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface);
-
- if (!surface)
- return;
-
- vk_free2(&instance->vk.alloc, pAllocator, surface);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- VkSurfaceKHR surface,
- VkBool32* pSupported)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_support(&device->wsi_device,
- queueFamilyIndex,
- surface,
- pSupported);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceCapabilitiesKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- VkSurfaceCapabilitiesKHR* pSurfaceCapabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_capabilities(&device->wsi_device,
- surface,
- pSurfaceCapabilities);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceCapabilities2KHR(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
- VkSurfaceCapabilities2KHR* pSurfaceCapabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_capabilities2(&device->wsi_device,
- pSurfaceInfo,
- pSurfaceCapabilities);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceCapabilities2EXT(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- VkSurfaceCapabilities2EXT* pSurfaceCapabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_capabilities2ext(&device->wsi_device,
- surface,
- pSurfaceCapabilities);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceFormatsKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pSurfaceFormatCount,
- VkSurfaceFormatKHR* pSurfaceFormats)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_formats(&device->wsi_device, surface,
- pSurfaceFormatCount, pSurfaceFormats);
-}
-
-VkResult anv_GetPhysicalDeviceSurfaceFormats2KHR(
- VkPhysicalDevice physicalDevice,
- const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
- uint32_t* pSurfaceFormatCount,
- VkSurfaceFormat2KHR* pSurfaceFormats)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_formats2(&device->wsi_device, pSurfaceInfo,
- pSurfaceFormatCount, pSurfaceFormats);
-}
-
-VkResult anv_GetPhysicalDeviceSurfacePresentModesKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pPresentModeCount,
- VkPresentModeKHR* pPresentModes)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_surface_present_modes(&device->wsi_device, surface,
- pPresentModeCount,
- pPresentModes);
-}
-
-VkResult anv_CreateSwapchainKHR(
- VkDevice _device,
- const VkSwapchainCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSwapchainKHR* pSwapchain)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct wsi_device *wsi_device = &device->physical->wsi_device;
- const VkAllocationCallbacks *alloc;
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &device->vk.alloc;
-
- return wsi_common_create_swapchain(wsi_device, _device,
- pCreateInfo, alloc, pSwapchain);
-}
-
-void anv_DestroySwapchainKHR(
- VkDevice _device,
- VkSwapchainKHR swapchain,
- const VkAllocationCallbacks* pAllocator)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- const VkAllocationCallbacks *alloc;
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &device->vk.alloc;
-
- wsi_common_destroy_swapchain(_device, swapchain, alloc);
-}
-
-VkResult anv_GetSwapchainImagesKHR(
- VkDevice device,
- VkSwapchainKHR swapchain,
- uint32_t* pSwapchainImageCount,
- VkImage* pSwapchainImages)
-{
- return wsi_common_get_images(swapchain,
- pSwapchainImageCount,
- pSwapchainImages);
-}
-
-VkResult anv_AcquireNextImageKHR(
- VkDevice device,
- VkSwapchainKHR swapchain,
- uint64_t timeout,
- VkSemaphore semaphore,
- VkFence fence,
- uint32_t* pImageIndex)
-{
- VkAcquireNextImageInfoKHR acquire_info = {
- .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR,
- .swapchain = swapchain,
- .timeout = timeout,
- .semaphore = semaphore,
- .fence = fence,
- .deviceMask = 0,
- };
-
- return anv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex);
-}
-
VkResult anv_AcquireNextImage2KHR(
- VkDevice _device,
- const VkAcquireNextImageInfoKHR* pAcquireInfo,
- uint32_t* pImageIndex)
+ VkDevice _device,
+ const VkAcquireNextImageInfoKHR *pAcquireInfo,
+ uint32_t *pImageIndex)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
+ VK_FROM_HANDLE(anv_device, device, _device);
+
+ VkResult result =
+ wsi_common_acquire_next_image2(&device->physical->wsi_device,
+ _device, pAcquireInfo, pImageIndex);
+ if (result == VK_SUCCESS)
+ anv_measure_acquire(device);
- anv_measure_acquire(device);
- return wsi_common_acquire_next_image2(&device->physical->wsi_device,
- _device, pAcquireInfo, pImageIndex);
+ return result;
}
VkResult anv_QueuePresentKHR(
@@ -291,111 +109,26 @@ VkResult anv_QueuePresentKHR(
{
ANV_FROM_HANDLE(anv_queue, queue, _queue);
struct anv_device *device = queue->device;
+ VkResult result;
if (device->debug_frame_desc) {
device->debug_frame_desc->frame_id++;
- if (!device->info.has_llc) {
- intel_clflush_range(device->debug_frame_desc,
- sizeof(*device->debug_frame_desc));
- }
}
- if (device->has_thread_submit &&
- pPresentInfo->waitSemaphoreCount > 0) {
- /* Make sure all of the dependency semaphores have materialized when
- * using a threaded submission.
- */
- VK_MULTIALLOC(ma);
- VK_MULTIALLOC_DECL(&ma, uint64_t, values,
- pPresentInfo->waitSemaphoreCount);
- VK_MULTIALLOC_DECL(&ma, uint32_t, syncobjs,
- pPresentInfo->waitSemaphoreCount);
-
- if (!vk_multialloc_alloc(&ma, &device->vk.alloc,
- VK_SYSTEM_ALLOCATION_SCOPE_COMMAND))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ if (u_trace_should_process(&device->ds.trace_context))
+ anv_queue_trace(queue, NULL, true /* frame */, false /* begin */);
- uint32_t wait_count = 0;
- for (uint32_t i = 0; i < pPresentInfo->waitSemaphoreCount; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pPresentInfo->pWaitSemaphores[i]);
- struct anv_semaphore_impl *impl =
- semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
- &semaphore->temporary : &semaphore->permanent;
-
- if (impl->type == ANV_SEMAPHORE_TYPE_DUMMY)
- continue;
- assert(impl->type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ);
- syncobjs[wait_count] = impl->syncobj;
- values[wait_count] = 0;
- wait_count++;
- }
-
- int ret = 0;
- if (wait_count > 0) {
- ret =
- anv_gem_syncobj_timeline_wait(device,
- syncobjs, values, wait_count,
- anv_get_absolute_timeout(INT64_MAX),
- true /* wait_all */,
- true /* wait_materialize */);
- }
-
- vk_free(&device->vk.alloc, values);
-
- if (ret)
- return vk_error(VK_ERROR_DEVICE_LOST);
- }
+ result = vk_queue_wait_before_present(&queue->vk, pPresentInfo);
+ if (result != VK_SUCCESS)
+ return result;
- VkResult result = wsi_common_queue_present(&device->physical->wsi_device,
- anv_device_to_handle(queue->device),
- _queue, 0,
- pPresentInfo);
+ result = wsi_common_queue_present(&device->physical->wsi_device,
+ anv_device_to_handle(queue->device),
+ _queue, 0,
+ pPresentInfo);
- for (uint32_t i = 0; i < pPresentInfo->waitSemaphoreCount; i++) {
- ANV_FROM_HANDLE(anv_semaphore, semaphore, pPresentInfo->pWaitSemaphores[i]);
- /* From the Vulkan 1.0.53 spec:
- *
- * "If the import is temporary, the implementation must restore the
- * semaphore to its prior permanent state after submitting the next
- * semaphore wait operation."
- */
- anv_semaphore_reset_temporary(queue->device, semaphore);
- }
+ if (u_trace_should_process(&device->ds.trace_context))
+ anv_queue_trace(queue, NULL, true /* frame */, true /* begin */);
return result;
}
-
-VkResult anv_GetDeviceGroupPresentCapabilitiesKHR(
- VkDevice device,
- VkDeviceGroupPresentCapabilitiesKHR* pCapabilities)
-{
- memset(pCapabilities->presentMask, 0,
- sizeof(pCapabilities->presentMask));
- pCapabilities->presentMask[0] = 0x1;
- pCapabilities->modes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR;
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetDeviceGroupSurfacePresentModesKHR(
- VkDevice device,
- VkSurfaceKHR surface,
- VkDeviceGroupPresentModeFlagsKHR* pModes)
-{
- *pModes = VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHR;
-
- return VK_SUCCESS;
-}
-
-VkResult anv_GetPhysicalDevicePresentRectanglesKHR(
- VkPhysicalDevice physicalDevice,
- VkSurfaceKHR surface,
- uint32_t* pRectCount,
- VkRect2D* pRects)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_common_get_present_rectangles(&device->wsi_device,
- surface,
- pRectCount, pRects);
-}
diff --git a/src/intel/vulkan/anv_wsi_display.c b/src/intel/vulkan/anv_wsi_display.c
deleted file mode 100644
index 4bb0453f55f..00000000000
--- a/src/intel/vulkan/anv_wsi_display.c
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Copyright © 2017 Keith Packard
- *
- * Permission to use, copy, modify, distribute, and sell this software and its
- * documentation for any purpose is hereby granted without fee, provided that
- * the above copyright notice appear in all copies and that both that copyright
- * notice and this permission notice appear in supporting documentation, and
- * that the name of the copyright holders not be used in advertising or
- * publicity pertaining to distribution of the software without specific,
- * written prior permission. The copyright holders make no representations
- * about the suitability of this software for any purpose. It is provided "as
- * is" without express or implied warranty.
- *
- * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
- * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
- * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- * OF THIS SOFTWARE.
- */
-
-#include "anv_private.h"
-#include "wsi_common.h"
-#include "vk_util.h"
-#include "wsi_common_display.h"
-
-VkResult
-anv_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device,
- uint32_t *property_count,
- VkDisplayPropertiesKHR *properties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_physical_device_display_properties(
- physical_device,
- &pdevice->wsi_device,
- property_count,
- properties);
-}
-
-VkResult
-anv_GetPhysicalDeviceDisplayProperties2KHR(
- VkPhysicalDevice physicalDevice,
- uint32_t* pPropertyCount,
- VkDisplayProperties2KHR* pProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- return wsi_display_get_physical_device_display_properties2(
- physicalDevice, &pdevice->wsi_device,
- pPropertyCount, pProperties);
-}
-
-VkResult
-anv_GetPhysicalDeviceDisplayPlanePropertiesKHR(
- VkPhysicalDevice physical_device,
- uint32_t *property_count,
- VkDisplayPlanePropertiesKHR *properties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_physical_device_display_plane_properties(
- physical_device, &pdevice->wsi_device,
- property_count, properties);
-}
-
-VkResult
-anv_GetPhysicalDeviceDisplayPlaneProperties2KHR(
- VkPhysicalDevice physicalDevice,
- uint32_t* pPropertyCount,
- VkDisplayPlaneProperties2KHR* pProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- return wsi_display_get_physical_device_display_plane_properties2(
- physicalDevice, &pdevice->wsi_device,
- pPropertyCount, pProperties);
-}
-
-VkResult
-anv_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device,
- uint32_t plane_index,
- uint32_t *display_count,
- VkDisplayKHR *displays)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_display_plane_supported_displays(physical_device,
- &pdevice->wsi_device,
- plane_index,
- display_count,
- displays);
-}
-
-
-VkResult
-anv_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device,
- VkDisplayKHR display,
- uint32_t *property_count,
- VkDisplayModePropertiesKHR *properties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_get_display_mode_properties(physical_device,
- &pdevice->wsi_device,
- display,
- property_count,
- properties);
-}
-
-VkResult
-anv_GetDisplayModeProperties2KHR(
- VkPhysicalDevice physicalDevice,
- VkDisplayKHR display,
- uint32_t* pPropertyCount,
- VkDisplayModeProperties2KHR* pProperties)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- return wsi_display_get_display_mode_properties2(physicalDevice,
- &pdevice->wsi_device,
- display,
- pPropertyCount,
- pProperties);
-}
-
-VkResult
-anv_CreateDisplayModeKHR(VkPhysicalDevice physical_device,
- VkDisplayKHR display,
- const VkDisplayModeCreateInfoKHR *create_info,
- const VkAllocationCallbacks *allocator,
- VkDisplayModeKHR *mode)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_display_create_display_mode(physical_device,
- &pdevice->wsi_device,
- display,
- create_info,
- allocator,
- mode);
-}
-
-VkResult
-anv_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device,
- VkDisplayModeKHR mode_khr,
- uint32_t plane_index,
- VkDisplayPlaneCapabilitiesKHR *capabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_get_display_plane_capabilities(physical_device,
- &pdevice->wsi_device,
- mode_khr,
- plane_index,
- capabilities);
-}
-
-VkResult
-anv_GetDisplayPlaneCapabilities2KHR(
- VkPhysicalDevice physicalDevice,
- const VkDisplayPlaneInfo2KHR* pDisplayPlaneInfo,
- VkDisplayPlaneCapabilities2KHR* pCapabilities)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
-
- return wsi_get_display_plane_capabilities2(physicalDevice,
- &pdevice->wsi_device,
- pDisplayPlaneInfo,
- pCapabilities);
-}
-
-VkResult
-anv_CreateDisplayPlaneSurfaceKHR(
- VkInstance _instance,
- const VkDisplaySurfaceCreateInfoKHR *create_info,
- const VkAllocationCallbacks *allocator,
- VkSurfaceKHR *surface)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
-
- if (allocator)
- alloc = allocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_display_surface(_instance, alloc, create_info, surface);
-}
-
-VkResult
-anv_ReleaseDisplayEXT(VkPhysicalDevice physical_device,
- VkDisplayKHR display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_release_display(physical_device,
- &pdevice->wsi_device,
- display);
-}
-
-#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
-VkResult
-anv_AcquireXlibDisplayEXT(VkPhysicalDevice physical_device,
- Display *dpy,
- VkDisplayKHR display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_acquire_xlib_display(physical_device,
- &pdevice->wsi_device,
- dpy,
- display);
-}
-
-VkResult
-anv_GetRandROutputDisplayEXT(VkPhysicalDevice physical_device,
- Display *dpy,
- RROutput output,
- VkDisplayKHR *display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_get_randr_output_display(physical_device,
- &pdevice->wsi_device,
- dpy,
- output,
- display);
-}
-#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */
-
-/* VK_EXT_display_control */
-
-VkResult
-anv_DisplayPowerControlEXT(VkDevice _device,
- VkDisplayKHR display,
- const VkDisplayPowerInfoEXT *display_power_info)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- return wsi_display_power_control(
- _device, &device->physical->wsi_device,
- display, display_power_info);
-}
-
-VkResult
-anv_RegisterDeviceEventEXT(VkDevice _device,
- const VkDeviceEventInfoEXT *device_event_info,
- const VkAllocationCallbacks *allocator,
- VkFence *_fence)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_fence *fence;
- VkResult ret;
-
- fence = vk_zalloc2(&device->vk.alloc, allocator, sizeof (*fence), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (!fence)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- fence->permanent.type = ANV_FENCE_TYPE_WSI;
-
- ret = wsi_register_device_event(_device,
- &device->physical->wsi_device,
- device_event_info,
- allocator,
- &fence->permanent.fence_wsi,
- -1);
- if (ret == VK_SUCCESS)
- *_fence = anv_fence_to_handle(fence);
- else
- vk_free2(&device->vk.alloc, allocator, fence);
- return ret;
-}
-
-VkResult
-anv_RegisterDisplayEventEXT(VkDevice _device,
- VkDisplayKHR display,
- const VkDisplayEventInfoEXT *display_event_info,
- const VkAllocationCallbacks *allocator,
- VkFence *_fence)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_fence *fence;
- VkResult ret;
-
- fence = vk_zalloc2(&device->vk.alloc, allocator, sizeof (*fence), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (!fence)
- return VK_ERROR_OUT_OF_HOST_MEMORY;
-
- fence->permanent.type = ANV_FENCE_TYPE_WSI;
-
- ret = wsi_register_display_event(
- _device, &device->physical->wsi_device,
- display, display_event_info, allocator, &fence->permanent.fence_wsi, -1);
-
- if (ret == VK_SUCCESS)
- *_fence = anv_fence_to_handle(fence);
- else
- vk_free2(&device->vk.alloc, allocator, fence);
- return ret;
-}
-
-VkResult
-anv_GetSwapchainCounterEXT(VkDevice _device,
- VkSwapchainKHR swapchain,
- VkSurfaceCounterFlagBitsEXT flag_bits,
- uint64_t *value)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
-
- return wsi_get_swapchain_counter(
- _device, &device->physical->wsi_device,
- swapchain, flag_bits, value);
-}
-
-VkResult
-anv_AcquireDrmDisplayEXT(VkPhysicalDevice physical_device,
- int32_t drm_fd,
- VkDisplayKHR display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_acquire_drm_display(physical_device, &pdevice->wsi_device, drm_fd, display);
-}
-
-VkResult
-anv_GetDrmDisplayEXT(VkPhysicalDevice physical_device,
- int32_t drm_fd,
- uint32_t connector_id,
- VkDisplayKHR *display)
-{
- ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
-
- return wsi_get_drm_display(physical_device, &pdevice->wsi_device, drm_fd, connector_id, display);
-}
diff --git a/src/intel/vulkan/anv_wsi_x11.c b/src/intel/vulkan/anv_wsi_x11.c
deleted file mode 100644
index 702eb57aafe..00000000000
--- a/src/intel/vulkan/anv_wsi_x11.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <X11/Xlib-xcb.h>
-#include <X11/xshmfence.h>
-#include <xcb/xcb.h>
-#include <xcb/dri3.h>
-#include <xcb/present.h>
-
-#include "wsi_common_x11.h"
-#include "anv_private.h"
-
-VkBool32 anv_GetPhysicalDeviceXcbPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- xcb_connection_t* connection,
- xcb_visualid_t visual_id)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_get_physical_device_xcb_presentation_support(
- &device->wsi_device,
- queueFamilyIndex,
- connection, visual_id);
-}
-
-VkBool32 anv_GetPhysicalDeviceXlibPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- Display* dpy,
- VisualID visualID)
-{
- ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
-
- return wsi_get_physical_device_xcb_presentation_support(
- &device->wsi_device,
- queueFamilyIndex,
- XGetXCBConnection(dpy), visualID);
-}
-
-VkResult anv_CreateXcbSurfaceKHR(
- VkInstance _instance,
- const VkXcbSurfaceCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XCB_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_xcb_surface(alloc, pCreateInfo, pSurface);
-}
-
-VkResult anv_CreateXlibSurfaceKHR(
- VkInstance _instance,
- const VkXlibSurfaceCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
-{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
-
- return wsi_create_xlib_surface(alloc, pCreateInfo, pSurface);
-}
diff --git a/src/intel/vulkan/genX_acceleration_structure.c b/src/intel/vulkan/genX_acceleration_structure.c
new file mode 100644
index 00000000000..db5c34cdcdb
--- /dev/null
+++ b/src/intel/vulkan/genX_acceleration_structure.c
@@ -0,0 +1,1287 @@
+/*
+ * Copyright © 2020 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include <math.h>
+
+#include "util/u_debug.h"
+#include "util/half_float.h"
+#include "util/u_atomic.h"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "genxml/genX_rt_pack.h"
+
+#include "ds/intel_tracepoints.h"
+
+#if GFX_VERx10 == 125
+#include "grl/grl_structs.h"
+
+/* Wait for the previous dispatches to finish and flush their data port
+ * writes.
+ */
+#define ANV_GRL_FLUSH_FLAGS (ANV_PIPE_END_OF_PIPE_SYNC_BIT | \
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
+
+static const VkAccelerationStructureGeometryKHR *
+get_geometry(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo,
+ uint32_t index)
+{
+ return pInfo->pGeometries ? &pInfo->pGeometries[index] :
+ pInfo->ppGeometries[index];
+}
+
+static size_t align_transient_size(size_t bytes)
+{
+ return align_uintptr(bytes, 64);
+}
+
+static size_t align_private_size(size_t bytes)
+{
+ return align_uintptr(bytes, 64);
+}
+
+static size_t get_scheduler_size(size_t num_builds)
+{
+ size_t scheduler_size = sizeof(union SchedulerUnion);
+ /* add more memory for qnode creation stage if needed */
+ if (num_builds > QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) {
+ scheduler_size += (num_builds - QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) * 2 *
+ sizeof(struct QNodeGlobalRootBufferEntry);
+ }
+
+ return align_private_size(scheduler_size);
+}
+
+static size_t
+get_batched_binnedsah_transient_mem_size(size_t num_builds)
+{
+ if (num_builds == 0)
+ return 0;
+ return num_builds * (sizeof(struct SAHBuildBuffersInfo) + sizeof(gpuva_t));
+}
+
+static size_t
+get_batched_binnedsah_private_mem_size(size_t num_builds)
+{
+ if (num_builds == 0)
+ return 0;
+
+ size_t globals_size = align_private_size(num_builds * sizeof(struct SAHBuildGlobals));
+ return globals_size + get_scheduler_size(num_builds);
+}
+
+static uint32_t
+estimate_qbvh6_nodes(const uint32_t N)
+{
+ const uint32_t W = 6;
+ const uint32_t N0 = N / 2 + N % 2; // lowest level with 2 leaves per QBVH6 node
+ const uint32_t N1 = N0 / W + (N0 % W ? 1 : 0); // filled level
+ const uint32_t N2 = N0 / W + (N1 % W ? 1 : 0); // filled level
+ const uint32_t N3 = N0 / W + (N2 % W ? 1 : 0); // filled level
+ const uint32_t N4 = N3; // overestimate remaining nodes
+ return N0 + N1 + N2 + N3 + N4;
+}
+
+/* Estimates the worst-case number of QBVH6 nodes for a top-down BVH
+ * build that is guaranteed to produce subtrees with N >= K primitives,
+ * from each of which a single QBVH6 node is created.
+ */
+static uint32_t
+estimate_qbvh6_nodes_minK(const uint32_t N, uint32_t K)
+{
+ const uint32_t N0 = N / K + (N % K ? 1 : 0); // lowest level of nodes, each with at least K leaves
+ return N0 + estimate_qbvh6_nodes(N0);
+}
+
+static size_t
+estimate_qbvh6_fatleafs(const size_t P)
+{
+ return P;
+}
+
+static size_t
+estimate_qbvh6_nodes_worstcase(const size_t P)
+{
+ const size_t F = estimate_qbvh6_fatleafs(P);
+
+ // Worst case: each inner node has 5 fat-leaf children, so there are
+ // F/5 inner nodes and F fat-leaves.
+ return F + ceil(F/5.0);
+}
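
To make the bound concrete: with one fat-leaf per primitive and, in the worst case, five fat-leaf children per inner node, P primitives cost roughly P + ceil(P / 5) nodes. A quick sanity check (editorial, not driver code):

   /* estimate_qbvh6_nodes_worstcase(1000) == 1000 + ceil(1000 / 5.0) == 1200,
    * so get_gpu_size_estimate() reserves about 1200 * sizeof_InternalNode
    * bytes of inner-node space for a 1000-primitive build. */
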
+
+#define sizeof_PrimRef 32
+#define sizeof_HwInstanceLeaf (GENX(RT_BVH_INSTANCE_LEAF_length) * 4)
+#define sizeof_InternalNode (GENX(RT_BVH_INTERNAL_NODE_length) * 4)
+#define sizeof_Procedural (GENX(RT_BVH_PROCEDURAL_LEAF_length) * 4)
+#define sizeof_Quad (GENX(RT_BVH_QUAD_LEAF_length) * 4)
+
+static struct MKSizeEstimate
+get_gpu_size_estimate(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo,
+ const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos,
+ const uint32_t *pMaxPrimitiveCounts)
+{
+ uint32_t num_triangles = 0, num_aabbs = 0, num_instances = 0;
+ for (unsigned g = 0; g < pInfo->geometryCount; g++) {
+ const VkAccelerationStructureGeometryKHR *pGeometry =
+ get_geometry(pInfo, g);
+ uint32_t prim_count = pBuildRangeInfos != NULL ?
+ pBuildRangeInfos[g].primitiveCount : pMaxPrimitiveCounts[g];
+
+ switch (pGeometry->geometryType) {
+ case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
+ num_triangles += prim_count;
+ break;
+ case VK_GEOMETRY_TYPE_AABBS_KHR:
+ num_aabbs += prim_count;
+ break;
+ case VK_GEOMETRY_TYPE_INSTANCES_KHR:
+ num_instances += prim_count;
+ break;
+ default:
+ unreachable("Unsupported geometry type");
+ }
+ }
+ const uint32_t num_primitives = num_triangles + num_aabbs + num_instances;
+
+ struct MKSizeEstimate est = {};
+
+ uint64_t size = sizeof(BVHBase);
+ size = align64(size, 64);
+
+ /* Must immediately follow BVHBase because we use a fixed offset to the nodes. */
+ est.node_data_start = size;
+
+ switch (pInfo->type) {
+ case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: {
+ assert(num_triangles == 0 && num_aabbs == 0);
+
+ est.numPrimitives = num_instances;
+ est.numPrimitivesToSplit = 0;
+ est.numBuildPrimitives = est.numPrimitives + est.numPrimitivesToSplit;
+
+ est.min_primitives = est.numPrimitives;
+ est.max_primitives = est.numPrimitives + est.numPrimitivesToSplit;
+
+ unsigned int sizeInnerNodes =
+ (unsigned int) estimate_qbvh6_nodes_worstcase(est.numBuildPrimitives) *
+ sizeof_InternalNode;
+ if (sizeInnerNodes == 0)
+ sizeInnerNodes = sizeof_InternalNode;
+
+ est.max_inner_nodes = sizeInnerNodes / sizeof_InternalNode;
+
+ size += sizeInnerNodes;
+ STATIC_ASSERT(sizeof_InternalNode % 64 == 0);
+
+ est.leaf_data_start = size;
+ size += est.numBuildPrimitives * sizeof_HwInstanceLeaf;
+ STATIC_ASSERT(sizeof_HwInstanceLeaf % 64 == 0);
+
+ est.leaf_data_size = est.numBuildPrimitives * sizeof_HwInstanceLeaf;
+
+ break;
+ }
+
+ case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: {
+ assert(num_instances == 0);
+
+ /* RT: TODO */
+ const float split_factor = 0.0f;
+ uint32_t num_prims_to_split = 0;
+ if (false)
+ num_prims_to_split = num_triangles + (double)split_factor;
+
+ const uint32_t num_build_triangles = num_triangles + num_prims_to_split;
+ const uint32_t num_build_primitives = num_build_triangles + num_aabbs;
+
+ est.numPrimitives = num_primitives;
+ est.numTriangles = num_triangles;
+ est.numProcedurals = num_aabbs;
+ est.numMeshes = pInfo->geometryCount;
+ est.numBuildPrimitives = num_build_primitives;
+ est.numPrimitivesToSplit = num_prims_to_split;
+ est.max_instance_leafs = 0;
+
+ est.min_primitives = (size_t)(num_build_triangles * 0.5f + num_aabbs);
+ est.max_primitives = num_build_triangles + num_aabbs;
+
+ size_t nodeBytes = 0;
+ nodeBytes += estimate_qbvh6_nodes_worstcase(num_build_triangles) * sizeof_InternalNode;
+ nodeBytes += estimate_qbvh6_nodes_worstcase(num_aabbs) * sizeof_InternalNode;
+ if (nodeBytes == 0) // for the case with 0 primitives
+ nodeBytes = sizeof_InternalNode;
+ nodeBytes = MAX2(nodeBytes, 8 * (size_t)num_build_primitives); // for primref_index0/1 buffers
+
+ est.max_inner_nodes = nodeBytes / sizeof_InternalNode;
+
+ size += nodeBytes;
+ STATIC_ASSERT(sizeof_InternalNode % 64 == 0);
+
+ est.leaf_data_start = size;
+ size += num_build_triangles * sizeof_Quad;
+ STATIC_ASSERT(sizeof_Quad % 64 == 0);
+
+ est.procedural_data_start = size;
+ size += num_aabbs * sizeof_Procedural;
+ STATIC_ASSERT(sizeof_Procedural % 64 == 0);
+
+ est.leaf_data_size = num_build_triangles * sizeof_Quad +
+ num_aabbs * sizeof_Procedural;
+
+ if (num_build_primitives == 0)
+ size += MAX2(sizeof_Quad, sizeof_Procedural);
+ break;
+ }
+
+ default:
+ unreachable("Unsupported acceleration structure type");
+ }
+
+ size = align64(size, 64);
+ est.instance_descs_start = size;
+ size += sizeof(struct InstanceDesc) * num_instances;
+
+ est.geo_meta_data_start = size;
+ size += sizeof(struct GeoMetaData) * pInfo->geometryCount;
+ size = align64(size, 64);
+
+ assert(size == align64(size, 64));
+ est.back_pointer_start = size;
+
+ const bool alloc_backpointers = false; /* RT TODO */
+ if (alloc_backpointers) {
+ size += est.max_inner_nodes * sizeof(uint32_t);
+ size = align64(size, 64);
+ }
+
+ assert(size < UINT32_MAX);
+ est.sizeTotal = align64(size, 64);
+
+ return est;
+}
+
+struct scratch_layout {
+ gpuva_t base;
+ uint32_t total_size;
+
+ gpuva_t primrefs;
+ gpuva_t globals;
+ gpuva_t leaf_index_buffers;
+ uint32_t leaf_index_buffer_stride;
+
+ /* new_sah */
+ gpuva_t qnode_buffer;
+ gpuva_t bvh2_buffer;
+};
+
+static size_t
+get_bvh2_size(uint32_t num_primitives)
+{
+ if (num_primitives == 0)
+ return 0;
+ return sizeof(struct BVH2) +
+ (2 * num_primitives - 1) * sizeof(struct BVH2Node);
+}
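
The 2 * N - 1 factor is the usual bound for a binary tree: N leaves imply at most N - 1 internal nodes. For example (editorial note, not driver code):

   /* get_bvh2_size(4) == sizeof(struct BVH2) + 7 * sizeof(struct BVH2Node),
    * i.e. 4 leaf nodes plus at most 3 internal BVH2 nodes. */
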
+
+static struct scratch_layout
+get_gpu_scratch_layout(struct anv_address base,
+ struct MKSizeEstimate est,
+ enum anv_rt_bvh_build_method build_method)
+{
+ struct scratch_layout scratch = {
+ .base = anv_address_physical(base),
+ };
+ gpuva_t current = anv_address_physical(base);
+
+ scratch.globals = current;
+ current += sizeof(struct Globals);
+
+ scratch.primrefs = intel_canonical_address(current);
+ current += est.numBuildPrimitives * sizeof_PrimRef;
+
+ scratch.leaf_index_buffers = intel_canonical_address(current);
+ current += est.numBuildPrimitives * sizeof(uint32_t) * 2;
+ scratch.leaf_index_buffer_stride = sizeof(uint32_t);
+
+ switch (build_method) {
+ case ANV_BVH_BUILD_METHOD_TRIVIAL:
+ break;
+
+ case ANV_BVH_BUILD_METHOD_NEW_SAH: {
+ size_t bvh2_size = get_bvh2_size(est.numBuildPrimitives);
+ if (est.leaf_data_size < bvh2_size) {
+ scratch.bvh2_buffer = intel_canonical_address(current);
+ current += bvh2_size;
+ }
+
+ scratch.qnode_buffer = intel_canonical_address(current);
+ current += 2 * sizeof(dword) * est.max_inner_nodes;
+ break;
+ }
+
+ default:
+ unreachable("invalid build");
+ }
+
+ assert((current - scratch.base) < UINT32_MAX);
+ scratch.total_size = current - scratch.base;
+
+ return scratch;
+}
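
Reading the layout back, scratch memory is consumed front to back: a Globals block, one 32-byte PrimRef per build primitive, two 32-bit leaf-index entries per primitive, and then the buffers specific to the chosen builder. A rough sketch of the trivial-builder total for n_prims build primitives (a sketch based on the sizes used above, not a replacement for total_size):

   size_t trivial_scratch_bytes(uint32_t n_prims)
   {
      return sizeof(struct Globals) +          /* build globals      */
             n_prims * sizeof_PrimRef +        /* 32-byte PrimRefs   */
             n_prims * 2 * sizeof(uint32_t);   /* leaf index buffers */
   }

The NEW_SAH path additionally appends the qnode buffer (2 dwords per worst-case inner node) and, when the BVH buffer's leaf area is too small to be reused, a separate BVH2 buffer.
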
+
+static void
+anv_get_gpu_acceleration_structure_size(
+ UNUSED struct anv_device *device,
+ VkAccelerationStructureBuildTypeKHR buildType,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo,
+ const uint32_t* pMaxPrimitiveCounts,
+ VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo)
+{
+
+ struct MKSizeEstimate est = get_gpu_size_estimate(pBuildInfo, NULL,
+ pMaxPrimitiveCounts);
+ struct scratch_layout scratch = get_gpu_scratch_layout(ANV_NULL_ADDRESS, est,
+ device->bvh_build_method);
+
+ pSizeInfo->accelerationStructureSize = est.sizeTotal;
+ pSizeInfo->buildScratchSize = scratch.total_size;
+ pSizeInfo->updateScratchSize = scratch.total_size; /* TODO */
+}
+
+void
+genX(GetAccelerationStructureBuildSizesKHR)(
+ VkDevice _device,
+ VkAccelerationStructureBuildTypeKHR buildType,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo,
+ const uint32_t* pMaxPrimitiveCounts,
+ VkAccelerationStructureBuildSizesInfoKHR* pSizeInfo)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ assert(pSizeInfo->sType ==
+ VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR);
+
+ VkAccelerationStructureBuildSizesInfoKHR gpu_size_info;
+ anv_get_gpu_acceleration_structure_size(device, buildType, pBuildInfo,
+ pMaxPrimitiveCounts,
+ &gpu_size_info);
+
+ pSizeInfo->accelerationStructureSize =
+ gpu_size_info.accelerationStructureSize;
+ pSizeInfo->buildScratchSize = gpu_size_info.buildScratchSize;
+ pSizeInfo->updateScratchSize = gpu_size_info.updateScratchSize;
+}
+
+void
+genX(GetDeviceAccelerationStructureCompatibilityKHR)(
+ VkDevice _device,
+ const VkAccelerationStructureVersionInfoKHR* pVersionInfo,
+ VkAccelerationStructureCompatibilityKHR* pCompatibility)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ if (memcmp(pVersionInfo->pVersionData,
+ device->physical->rt_uuid,
+ sizeof(device->physical->rt_uuid)) == 0) {
+ *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR;
+ } else {
+ *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR;
+ }
+}
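
For reference, the version data compared here comes from the 2 * VK_UUID_SIZE-byte header of a serialized acceleration structure. A hedged application-side sketch (serialized_header is assumed to hold those header bytes):

   VkAccelerationStructureVersionInfoKHR version_info = {
      .sType = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_VERSION_INFO_KHR,
      .pVersionData = serialized_header,
   };
   VkAccelerationStructureCompatibilityKHR compat;
   vkGetDeviceAccelerationStructureCompatibilityKHR(dev, &version_info, &compat);
   if (compat != VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR) {
      /* Rebuild instead of deserializing the saved structure. */
   }
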
+
+static inline uint8_t
+vk_to_grl_GeometryFlags(VkGeometryFlagsKHR flags)
+{
+ uint8_t grl_flags = GEOMETRY_FLAG_NONE;
+ unsigned mask = flags;
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ switch ((VkGeometryFlagBitsKHR)(1u << i)) {
+ case VK_GEOMETRY_OPAQUE_BIT_KHR:
+ grl_flags |= GEOMETRY_FLAG_OPAQUE;
+ break;
+ case VK_GEOMETRY_NO_DUPLICATE_ANY_HIT_INVOCATION_BIT_KHR:
+ grl_flags |= GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION;
+ break;
+ default:
+ unreachable("Unsupported acceleration structure build flag");
+ }
+ }
+ return grl_flags;
+}
+
+static inline IndexFormat
+vk_to_grl_IndexFormat(VkIndexType type)
+{
+ switch (type) {
+ case VK_INDEX_TYPE_NONE_KHR: return INDEX_FORMAT_NONE;
+ case VK_INDEX_TYPE_UINT8_KHR: unreachable("No UINT8 support yet");
+ case VK_INDEX_TYPE_UINT16: return INDEX_FORMAT_R16_UINT;
+ case VK_INDEX_TYPE_UINT32: return INDEX_FORMAT_R32_UINT;
+ default:
+ unreachable("Unsupported index type");
+ }
+}
+
+static inline VertexFormat
+vk_to_grl_VertexFormat(VkFormat format)
+{
+ switch (format) {
+ case VK_FORMAT_R32G32_SFLOAT: return VERTEX_FORMAT_R32G32_FLOAT;
+ case VK_FORMAT_R32G32B32_SFLOAT: return VERTEX_FORMAT_R32G32B32_FLOAT;
+ case VK_FORMAT_R16G16_SFLOAT: return VERTEX_FORMAT_R16G16_FLOAT;
+ case VK_FORMAT_R16G16B16A16_SFLOAT: return VERTEX_FORMAT_R16G16B16A16_FLOAT;
+ case VK_FORMAT_R16G16_SNORM: return VERTEX_FORMAT_R16G16_SNORM;
+ case VK_FORMAT_R16G16B16A16_SNORM: return VERTEX_FORMAT_R16G16B16A16_SNORM;
+ case VK_FORMAT_R16G16B16A16_UNORM: return VERTEX_FORMAT_R16G16B16A16_UNORM;
+ case VK_FORMAT_R16G16_UNORM: return VERTEX_FORMAT_R16G16_UNORM;
+ /* case VK_FORMAT_R10G10B10A2_UNORM: return VERTEX_FORMAT_R10G10B10A2_UNORM; */
+ case VK_FORMAT_R8G8B8A8_UNORM: return VERTEX_FORMAT_R8G8B8A8_UNORM;
+ case VK_FORMAT_R8G8_UNORM: return VERTEX_FORMAT_R8G8_UNORM;
+ case VK_FORMAT_R8G8B8A8_SNORM: return VERTEX_FORMAT_R8G8B8A8_SNORM;
+ case VK_FORMAT_R8G8_SNORM: return VERTEX_FORMAT_R8G8_SNORM;
+ default:
+ unreachable("Unsupported vertex format");
+ }
+}
+
+static struct Geo
+vk_to_grl_Geo(const VkAccelerationStructureGeometryKHR *pGeometry,
+ uint32_t prim_count,
+ uint32_t transform_offset,
+ uint32_t primitive_offset,
+ uint32_t first_vertex)
+{
+ struct Geo geo = {
+ .Flags = vk_to_grl_GeometryFlags(pGeometry->flags),
+ };
+
+ switch (pGeometry->geometryType) {
+ case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
+ const VkAccelerationStructureGeometryTrianglesDataKHR *vk_tri =
+ &pGeometry->geometry.triangles;
+
+ geo.Type = GEOMETRY_TYPE_TRIANGLES;
+
+ geo.Desc.Triangles.pTransformBuffer =
+ vk_tri->transformData.deviceAddress;
+ geo.Desc.Triangles.pIndexBuffer =
+ vk_tri->indexData.deviceAddress;
+ geo.Desc.Triangles.pVertexBuffer =
+ vk_tri->vertexData.deviceAddress;
+ geo.Desc.Triangles.VertexBufferByteStride = vk_tri->vertexStride;
+
+ if (geo.Desc.Triangles.pTransformBuffer)
+ geo.Desc.Triangles.pTransformBuffer += transform_offset;
+
+ if (vk_tri->indexType == VK_INDEX_TYPE_NONE_KHR) {
+ geo.Desc.Triangles.IndexCount = 0;
+ geo.Desc.Triangles.VertexCount = prim_count * 3;
+ geo.Desc.Triangles.IndexFormat = INDEX_FORMAT_NONE;
+ geo.Desc.Triangles.pVertexBuffer += primitive_offset;
+ } else {
+ geo.Desc.Triangles.IndexCount = prim_count * 3;
+ geo.Desc.Triangles.VertexCount = vk_tri->maxVertex;
+ geo.Desc.Triangles.IndexFormat =
+ vk_to_grl_IndexFormat(vk_tri->indexType);
+ geo.Desc.Triangles.pIndexBuffer += primitive_offset;
+ }
+
+ geo.Desc.Triangles.VertexFormat =
+ vk_to_grl_VertexFormat(vk_tri->vertexFormat);
+ geo.Desc.Triangles.pVertexBuffer += vk_tri->vertexStride * first_vertex;
+ break;
+ }
+
+ case VK_GEOMETRY_TYPE_AABBS_KHR: {
+ const VkAccelerationStructureGeometryAabbsDataKHR *vk_aabbs =
+ &pGeometry->geometry.aabbs;
+ geo.Type = GEOMETRY_TYPE_PROCEDURAL;
+ geo.Desc.Procedural.pAABBs_GPUVA =
+ vk_aabbs->data.deviceAddress + primitive_offset;
+ geo.Desc.Procedural.AABBByteStride = vk_aabbs->stride;
+ geo.Desc.Procedural.AABBCount = prim_count;
+ break;
+ }
+
+ default:
+ unreachable("Invalid geometry type");
+ }
+
+ return geo;
+}
+
+#include "grl/grl_metakernel_copy.h"
+#include "grl/grl_metakernel_misc.h"
+#include "grl/grl_metakernel_build_primref.h"
+#include "grl/grl_metakernel_new_sah_builder.h"
+#include "grl/grl_metakernel_build_leaf.h"
+
+struct build_state {
+ enum anv_rt_bvh_build_method build_method;
+
+ struct MKSizeEstimate estimate;
+ struct scratch_layout scratch;
+ struct MKBuilderState state;
+
+ struct anv_address bvh_addr;
+
+ size_t geom_size_prefix_sum_buffer;
+ size_t transient_size;
+
+ uint32_t leaf_type;
+ uint32_t leaf_size;
+
+ uint32_t num_geometries;
+ uint32_t num_instances;
+
+ uint64_t instances_addr;
+ bool array_of_instances_ptr;
+
+ const VkAccelerationStructureGeometryKHR *vk_geoms;
+};
+
+static void
+get_binnedsah_scratch_buffers(struct build_state *bs,
+ uint64_t *p_qnode_buffer,
+ uint64_t *p_primref_indices,
+ uint64_t *p_bvh2)
+{
+ if (bs->estimate.numBuildPrimitives == 0)
+ {
+ *p_bvh2 = 0;
+ *p_qnode_buffer = 0;
+ *p_primref_indices = 0;
+ return;
+ }
+
+ size_t bvh2_size = get_bvh2_size(bs->estimate.numBuildPrimitives);
+ if (bs->estimate.leaf_data_size < bvh2_size) {
+ assert(bs->scratch.bvh2_buffer != 0);
+ *p_bvh2 = bs->scratch.bvh2_buffer;
+ } else {
+ *p_bvh2 = intel_canonical_address(bs->state.bvh_buffer +
+ bs->estimate.leaf_data_start);
+ }
+
+ assert(bs->scratch.qnode_buffer != 0);
+ *p_qnode_buffer = bs->scratch.qnode_buffer;
+
+ assert(bs->scratch.leaf_index_buffers != 0);
+ *p_primref_indices = bs->scratch.leaf_index_buffers;
+}
+
+static void
+write_memory(struct anv_cmd_alloc alloc, size_t offset, const void *data, size_t data_len)
+{
+ assert((offset + data_len) < alloc.size);
+ memcpy(alloc.map + offset, data, data_len);
+}
+
+static void
+cmd_build_acceleration_structures(
+ struct anv_cmd_buffer *cmd_buffer,
+ uint32_t infoCount,
+ const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
+ const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos,
+ const VkDeviceAddress *pIndirectDeviceAddresses,
+ const uint32_t *pIndirectStrides,
+ const uint32_t *const *ppMaxPrimitiveCounts)
+{
+ struct anv_device *device = cmd_buffer->device;
+ VK_MULTIALLOC(ma);
+
+ struct build_state *builds;
+ vk_multialloc_add(&ma, &builds, struct build_state, infoCount);
+
+ if (!vk_multialloc_zalloc(&ma,
+ &cmd_buffer->device->vk.alloc,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
+ return;
+ }
+
+ trace_intel_begin_as_build(&cmd_buffer->trace);
+
+ /* TODO: Indirect */
+ assert(ppBuildRangeInfos != NULL);
+
+ size_t transient_mem_init_globals_size = 0;
+ size_t transient_mem_init_globals_offset = 0;
+
+ size_t transient_total = 0;
+
+ size_t private_mem_total = 0;
+
+ size_t num_trivial_builds = 0;
+ size_t num_new_sah_builds = 0;
+
+ /* Prepare a bunch of data for the kernels we have to run. */
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];
+ struct anv_address scratch_addr =
+ anv_address_from_u64(pInfo->scratchData.deviceAddress);
+
+ const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
+ ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;
+ const uint32_t *pMaxPrimitiveCounts =
+ ppMaxPrimitiveCounts ? ppMaxPrimitiveCounts[i] : NULL;
+
+ ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel,
+ pInfo->dstAccelerationStructure);
+
+ bs->build_method = device->bvh_build_method;
+
+ bs->bvh_addr = anv_address_from_u64(vk_acceleration_structure_get_va(dst_accel));
+
+ bs->estimate = get_gpu_size_estimate(pInfo, pBuildRangeInfos,
+ pMaxPrimitiveCounts);
+ bs->scratch = get_gpu_scratch_layout(scratch_addr, bs->estimate,
+ bs->build_method);
+
+ uint32_t leaf_size, leaf_type;
+
+ switch (pInfo->type) {
+ case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: {
+ assert(pInfo->geometryCount == 1);
+
+ const VkAccelerationStructureGeometryKHR *pGeometry =
+ get_geometry(pInfo, 0);
+ assert(pGeometry->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR);
+
+ const VkAccelerationStructureGeometryInstancesDataKHR *instances =
+ &pGeometry->geometry.instances;
+
+ bs->num_instances = pBuildRangeInfos[0].primitiveCount;
+ bs->instances_addr = instances->data.deviceAddress;
+ bs->array_of_instances_ptr = instances->arrayOfPointers;
+ leaf_type = NODE_TYPE_INSTANCE;
+ leaf_size = GENX(RT_BVH_INSTANCE_LEAF_length) * 4;
+ break;
+ }
+
+ case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: {
+ bs->num_geometries = pInfo->geometryCount;
+ leaf_type = NODE_TYPE_QUAD;
+ leaf_size = GENX(RT_BVH_QUAD_LEAF_length) * 4;
+ break;
+ }
+
+ default:
+ unreachable("Unsupported acceleration structure type");
+ }
+
+ size_t geom_struct_size = bs->num_geometries * sizeof(struct Geo);
+ size_t geom_prefix_sum_size = align_uintptr(sizeof(uint32_t) * (bs->num_geometries + 1), 64);
+
+ bs->transient_size = geom_prefix_sum_size + geom_struct_size;
+
+ bs->geom_size_prefix_sum_buffer = transient_total + 0;
+
+ bs->state = (struct MKBuilderState) {
+ .geomDesc_buffer = bs->geom_size_prefix_sum_buffer +
+ geom_prefix_sum_size,
+ .build_primref_buffer = bs->scratch.primrefs,
+ .build_globals = bs->scratch.globals,
+ .bvh_buffer = anv_address_physical(bs->bvh_addr),
+ .leaf_type = leaf_type,
+ .leaf_size = leaf_size,
+ };
+
+ transient_total += bs->transient_size;
+
+ switch (device->bvh_build_method) {
+ case ANV_BVH_BUILD_METHOD_TRIVIAL:
+ num_trivial_builds++;
+ break;
+ case ANV_BVH_BUILD_METHOD_NEW_SAH:
+ num_new_sah_builds++;
+ break;
+ default:
+ unreachable("invalid BVH build method");
+ }
+
+ transient_mem_init_globals_size += sizeof(struct BatchedInitGlobalsData);
+ }
+
+ transient_total = align_transient_size(transient_total);
+ transient_mem_init_globals_offset = transient_total;
+ transient_total += align_transient_size(transient_mem_init_globals_size);
+
+ size_t transient_mem_binnedsah_size = 0;
+ size_t transient_mem_binnedsah_offset = 0;
+ size_t private_mem_binnedsah_size = 0;
+ size_t private_mem_binnedsah_offset = 0;
+
+ transient_mem_binnedsah_size = get_batched_binnedsah_transient_mem_size(num_new_sah_builds);
+ transient_mem_binnedsah_offset = transient_total;
+ transient_total += align_transient_size(transient_mem_binnedsah_size);
+
+ private_mem_binnedsah_size = get_batched_binnedsah_private_mem_size(num_new_sah_builds);
+ private_mem_binnedsah_offset = private_mem_total;
+ private_mem_total += align_private_size(private_mem_binnedsah_size);
+
+ /* Allocate required memory, unless we already have a suitable buffer */
+ struct anv_cmd_alloc private_mem_alloc;
+ if (private_mem_total > cmd_buffer->state.rt.build_priv_mem_size) {
+ private_mem_alloc =
+ anv_cmd_buffer_alloc_space(cmd_buffer, private_mem_total, 64,
+ false /* mapped */);
+ if (anv_cmd_alloc_is_empty(private_mem_alloc)) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto error;
+ }
+
+ cmd_buffer->state.rt.build_priv_mem_addr = private_mem_alloc.address;
+ cmd_buffer->state.rt.build_priv_mem_size = private_mem_alloc.size;
+ } else {
+ private_mem_alloc = (struct anv_cmd_alloc) {
+ .address = cmd_buffer->state.rt.build_priv_mem_addr,
+ .map = anv_address_map(cmd_buffer->state.rt.build_priv_mem_addr),
+ .size = cmd_buffer->state.rt.build_priv_mem_size,
+ };
+ }
+
+ struct anv_cmd_alloc transient_mem_alloc =
+ anv_cmd_buffer_alloc_space(cmd_buffer, transient_total, 64,
+ true /* mapped */);
+ if (transient_total > 0 && anv_cmd_alloc_is_empty(transient_mem_alloc)) {
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ goto error;
+ }
+
+ uint64_t private_base = anv_address_physical(private_mem_alloc.address);
+ uint64_t transient_base = anv_address_physical(transient_mem_alloc.address);
+
+ /* Prepare transient memory */
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];
+
+ const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
+ ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;
+
+ struct Geo *geos = transient_mem_alloc.map + bs->state.geomDesc_buffer;
+ uint32_t *prefixes = transient_mem_alloc.map + bs->geom_size_prefix_sum_buffer;
+ uint32_t prefix_sum = 0;
+ for (unsigned g = 0; g < bs->num_geometries; g++) {
+ const VkAccelerationStructureGeometryKHR *pGeometry = get_geometry(pInfo, g);
+ uint32_t prim_count = pBuildRangeInfos[g].primitiveCount;
+ geos[g] = vk_to_grl_Geo(pGeometry, prim_count,
+ pBuildRangeInfos[g].transformOffset,
+ pBuildRangeInfos[g].primitiveOffset,
+ pBuildRangeInfos[g].firstVertex);
+
+ prefixes[g] = prefix_sum;
+ prefix_sum += prim_count;
+ }
+
+ prefixes[bs->num_geometries] = prefix_sum;
+
+ bs->geom_size_prefix_sum_buffer =
+ intel_canonical_address(bs->geom_size_prefix_sum_buffer +
+ transient_base);
+ bs->state.geomDesc_buffer =
+ intel_canonical_address(bs->state.geomDesc_buffer +
+ transient_base);
+
+ struct BatchedInitGlobalsData data = {
+ .p_build_globals = bs->scratch.globals,
+ .p_bvh_buffer = anv_address_physical(bs->bvh_addr),
+
+ .numPrimitives = 0,
+ .numGeometries = bs->num_geometries,
+ .numInstances = bs->num_instances,
+
+ .instance_descs_start = bs->estimate.instance_descs_start,
+ .geo_meta_data_start = bs->estimate.geo_meta_data_start,
+ .node_data_start = bs->estimate.node_data_start,
+ .leaf_data_start = bs->estimate.leaf_data_start,
+ .procedural_data_start = bs->estimate.procedural_data_start,
+ .back_pointer_start = bs->estimate.back_pointer_start,
+ .sizeTotal = bs->estimate.sizeTotal,
+
+ .leafType = bs->state.leaf_type,
+ .leafSize = bs->state.leaf_size,
+ };
+
+ write_memory(transient_mem_alloc,
+ transient_mem_init_globals_offset + i * sizeof(data),
+ &data, sizeof(data));
+ }
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+ /* Due to the nature of GRL and its heavy use of jumps/predication, we
+ * cannot tell exactly in what order the CFE_STATE commands we insert are
+ * going to be executed, so always use the largest possible size.
+ */
+ genX(cmd_buffer_ensure_cfe_state)(
+ cmd_buffer,
+ cmd_buffer->device->physical->max_grl_scratch_size);
+
+ /* Round 1 : init_globals kernel */
+ genX(grl_misc_batched_init_globals)(
+ cmd_buffer,
+ intel_canonical_address(transient_base +
+ transient_mem_init_globals_offset),
+ infoCount);
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_GRL_FLUSH_FLAGS,
+ "building accel struct");
+
+ /* Round 2 : Copy instance/geometry data from the application provided
+ * buffers into the acceleration structures.
+ */
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ /* Metadata */
+ if (bs->num_instances) {
+ assert(bs->num_geometries == 0);
+
+ const uint64_t copy_size = bs->num_instances * sizeof(InstanceDesc);
+ /* This must be calculated in the same way as
+ * groupCountForGeoMetaDataCopySize.
+ */
+ const uint32_t num_threads = (copy_size >> 8) + 3;
+
+ if (bs->array_of_instances_ptr) {
+ genX(grl_misc_copy_instance_ptrs)(
+ cmd_buffer,
+ anv_address_physical(anv_address_add(bs->bvh_addr,
+ bs->estimate.instance_descs_start)),
+ bs->instances_addr,
+ copy_size, num_threads);
+ } else {
+ genX(grl_misc_copy_instances)(
+ cmd_buffer,
+ anv_address_physical(anv_address_add(bs->bvh_addr,
+ bs->estimate.instance_descs_start)),
+ bs->instances_addr,
+ copy_size, num_threads);
+ }
+ }
+
+ if (bs->num_geometries) {
+ assert(bs->num_instances == 0);
+ const uint64_t copy_size = bs->num_geometries * sizeof(struct GeoMetaData);
+
+ /* This must be calculated in the same way as
+ * groupCountForGeoMetaDataCopySize.
+ */
+ const uint32_t num_threads = (copy_size >> 6) + 1;
+
+ genX(grl_misc_copy_geo_meta_data)(
+ cmd_buffer,
+ anv_address_physical(anv_address_add(bs->bvh_addr,
+ bs->estimate.geo_meta_data_start)),
+ bs->state.geomDesc_buffer,
+ copy_size,
+ num_threads);
+ }
+
+ /* Primrefs */
+ if (bs->num_instances) {
+ if (bs->array_of_instances_ptr) {
+ genX(grl_build_primref_buildPrimirefsFromInstancesArrOfPtrs)(
+ cmd_buffer,
+ bs->instances_addr,
+ PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
+ PREFIX_MK_STATE(grl_build_primref, bs->state),
+ false /* allowUpdate */);
+ } else {
+ genX(grl_build_primref_buildPrimirefsFromInstances)(
+ cmd_buffer,
+ bs->instances_addr,
+ PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
+ PREFIX_MK_STATE(grl_build_primref, bs->state),
+ false /* allowUpdate */);
+ }
+ }
+
+ if (bs->num_geometries) {
+ const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];
+ const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
+ ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;
+
+ assert(pInfo->geometryCount == bs->num_geometries);
+ for (unsigned g = 0; g < pInfo->geometryCount; g++) {
+ const VkAccelerationStructureGeometryKHR *pGeometry =
+ get_geometry(pInfo, g);
+
+ switch (pGeometry->geometryType) {
+ case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
+ genX(grl_build_primref_primrefs_from_tris)(
+ cmd_buffer,
+ PREFIX_MK_STATE(grl_build_primref, bs->state),
+ PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
+ bs->state.geomDesc_buffer + g * sizeof(struct Geo),
+ g,
+ vk_to_grl_GeometryFlags(pGeometry->flags),
+ /* TODO: Indirect */
+ pBuildRangeInfos[g].primitiveCount);
+ break;
+
+ case VK_GEOMETRY_TYPE_AABBS_KHR:
+ genX(grl_build_primref_primrefs_from_proc)(
+ cmd_buffer,
+ PREFIX_MK_STATE(grl_build_primref, bs->state),
+ PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
+ bs->state.geomDesc_buffer + g * sizeof(struct Geo),
+ g,
+ vk_to_grl_GeometryFlags(pGeometry->flags),
+ /* TODO: Indirect */
+ pBuildRangeInfos[g].primitiveCount);
+ break;
+
+ default:
+ unreachable("Invalid geometry type");
+ }
+ }
+ }
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_GRL_FLUSH_FLAGS,
+ "building accel struct");
+
+ /* Dispatch trivial builds */
+ if (num_trivial_builds) {
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ if (bs->build_method != ANV_BVH_BUILD_METHOD_TRIVIAL)
+ continue;
+
+ genX(grl_new_sah_builder_single_pass_binsah)(
+ cmd_buffer,
+ bs->scratch.globals,
+ bs->state.bvh_buffer,
+ bs->state.build_primref_buffer,
+ bs->scratch.leaf_index_buffers,
+ false /* alloc_backpointers */);
+ }
+ }
+
+ /* Dispatch new SAH builds */
+ if (num_new_sah_builds) {
+ size_t global_ptrs_offset = transient_mem_binnedsah_offset;
+ size_t buffers_info_offset = transient_mem_binnedsah_offset + sizeof(gpuva_t) * num_new_sah_builds;
+
+ size_t scheduler_offset = private_mem_binnedsah_offset;
+ size_t sah_globals_offset = private_mem_binnedsah_offset + get_scheduler_size(num_new_sah_builds);
+
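+      /* The transient buffer holds the per-build arrays filled in below
+       * (globals pointers and SAHBuildBuffersInfo); the private buffer holds
+       * the scheduler and SAH globals shared across the batched builds.
+       */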
+ struct SAHBuildArgsBatchable args = {
+ .num_builds = infoCount,
+ .p_globals_ptrs = intel_canonical_address(transient_base + global_ptrs_offset),
+ .p_buffers_info = intel_canonical_address(transient_base + buffers_info_offset),
+ .p_scheduler = intel_canonical_address(private_base + scheduler_offset),
+ .p_sah_globals = intel_canonical_address(private_base + sah_globals_offset),
+ .num_max_qnode_global_root_buffer_entries = MAX2(num_new_sah_builds, QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM),
+ };
+
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ if (bs->build_method != ANV_BVH_BUILD_METHOD_NEW_SAH)
+ continue;
+
+ uint64_t p_build_primref_index_buffers;
+ uint64_t p_bvh2;
+ uint64_t p_qnode_child_buffer;
+
+ get_binnedsah_scratch_buffers(bs,
+ &p_qnode_child_buffer,
+ &p_build_primref_index_buffers,
+ &p_bvh2);
+
+ struct SAHBuildBuffersInfo buffers = {
+ .p_primref_index_buffers = bs->scratch.leaf_index_buffers,
+ .p_bvh_base = bs->state.bvh_buffer,
+ .p_primrefs_buffer = bs->state.build_primref_buffer,
+ .p_bvh2 = p_bvh2,
+ .p_qnode_root_buffer = p_qnode_child_buffer,
+ .sah_globals_flags = 0,
+ };
+
+ write_memory(transient_mem_alloc, buffers_info_offset, &buffers, sizeof(buffers));
+ buffers_info_offset += sizeof(buffers);
+
+ write_memory(transient_mem_alloc, global_ptrs_offset, &bs->state.build_globals,
+ sizeof(bs->state.build_globals));
+ global_ptrs_offset += sizeof(bs->state.build_globals);
+ }
+
+ genX(grl_new_sah_builder_new_sah_build_batchable)(
+ cmd_buffer, PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(grl_new_sah_builder, args));
+ }
+
+ if (num_new_sah_builds == 0)
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_GRL_FLUSH_FLAGS,
+ "building accel struct");
+
+ /* Finally write the leaves. */
+ for (uint32_t i = 0; i < infoCount; i++) {
+ struct build_state *bs = &builds[i];
+
+ if (bs->num_instances) {
+ assert(bs->num_geometries == 0);
+ if (bs->array_of_instances_ptr) {
+ genX(grl_leaf_builder_buildLeafDXR_instances_pointers)(cmd_buffer,
+ PREFIX_MK_STATE(grl_leaf_builder, bs->state),
+ bs->scratch.leaf_index_buffers,
+ bs->instances_addr,
+ bs->scratch.leaf_index_buffer_stride,
+ 0 /* offset */,
+ bs->estimate.numBuildPrimitives);
+ } else {
+ genX(grl_leaf_builder_buildLeafDXR_instances)(cmd_buffer,
+ PREFIX_MK_STATE(grl_leaf_builder, bs->state),
+ bs->scratch.leaf_index_buffers,
+ bs->instances_addr,
+ bs->scratch.leaf_index_buffer_stride,
+ 0 /* offset */,
+ bs->estimate.numBuildPrimitives);
+ }
+ }
+
+ if (bs->num_geometries) {
+ assert(bs->num_instances == 0);
+ const uint64_t p_numPrimitives =
+ bs->state.build_globals + offsetof(struct Globals, numPrimitives);
+
+ assert(bs->estimate.numProcedurals == 0 ||
+ bs->estimate.numTriangles == 0);
+ if (bs->estimate.numProcedurals) {
+ genX(grl_leaf_builder_buildLeafDXR_procedurals)(
+ cmd_buffer,
+ PREFIX_MK_STATE(grl_leaf_builder, bs->state),
+ bs->scratch.leaf_index_buffers,
+ bs->scratch.leaf_index_buffer_stride,
+ 0 /* offset */,
+ p_numPrimitives);
+ } else {
+ genX(grl_leaf_builder_buildLeafDXR_quads)(
+ cmd_buffer,
+ PREFIX_MK_STATE(grl_leaf_builder, bs->state),
+ bs->scratch.leaf_index_buffers,
+ bs->scratch.leaf_index_buffer_stride,
+ 0 /* offset */,
+ p_numPrimitives,
+ false /* allow_updates */);
+ }
+ }
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_GRL_FLUSH_FLAGS,
+ "building accel struct");
+
+ trace_intel_end_as_build(&cmd_buffer->trace);
+
+ error:
+ vk_free(&cmd_buffer->device->vk.alloc, builds);
+}
+
+void
+genX(CmdBuildAccelerationStructuresKHR)(
+ VkCommandBuffer commandBuffer,
+ uint32_t infoCount,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
+ const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ cmd_build_acceleration_structures(cmd_buffer, infoCount, pInfos,
+ ppBuildRangeInfos, NULL, NULL, NULL);
+}
+
+void
+genX(CmdBuildAccelerationStructuresIndirectKHR)(
+ VkCommandBuffer commandBuffer,
+ uint32_t infoCount,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
+ const VkDeviceAddress* pIndirectDeviceAddresses,
+ const uint32_t* pIndirectStrides,
+ const uint32_t* const* ppMaxPrimitiveCounts)
+{
+ unreachable("Unimplemented");
+}
+
+void
+genX(CmdCopyAccelerationStructureKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkCopyAccelerationStructureInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src);
+ ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst);
+
+ assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR ||
+ pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR);
+
+ if (pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR) {
+ uint64_t src_size_addr =
+ vk_acceleration_structure_get_va(src_accel) +
+ offsetof(struct BVHBase, Meta.allocationSize);
+ genX(grl_copy_clone_indirect)(
+ cmd_buffer,
+ vk_acceleration_structure_get_va(dst_accel),
+ vk_acceleration_structure_get_va(src_accel),
+ src_size_addr);
+ } else {
+ genX(grl_copy_compact)(
+ cmd_buffer,
+ vk_acceleration_structure_get_va(dst_accel),
+ vk_acceleration_structure_get_va(src_accel));
+ }
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "after copy acceleration struct");
+}
+
+void
+genX(CmdCopyAccelerationStructureToMemoryKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src);
+ struct anv_device *device = cmd_buffer->device;
+ uint64_t src_size_addr =
+ vk_acceleration_structure_get_va(src_accel) +
+ offsetof(struct BVHBase, Meta.allocationSize);
+
+ assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR);
+
+ genX(grl_copy_serialize_indirect)(
+ cmd_buffer,
+ pInfo->dst.deviceAddress,
+ vk_acceleration_structure_get_va(src_accel),
+ anv_address_physical(device->rt_uuid_addr),
+ src_size_addr);
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "after copy acceleration struct");
+}
+
+void
+genX(CmdCopyMemoryToAccelerationStructureKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst);
+
+ assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR);
+
+ uint64_t src_size_addr = pInfo->src.deviceAddress +
+ offsetof(struct SerializationHeader, DeserializedSizeInBytes);
+ genX(grl_copy_deserialize_indirect)(
+ cmd_buffer,
+ vk_acceleration_structure_get_va(dst_accel),
+ pInfo->src.deviceAddress,
+ src_size_addr);
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "after copy acceleration struct");
+}
+
+/* TODO: Host commands */
+
+VkResult
+genX(BuildAccelerationStructuresKHR)(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ uint32_t infoCount,
+ const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
+ const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+VkResult
+genX(CopyAccelerationStructureKHR)(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ const VkCopyAccelerationStructureInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+VkResult
+genX(CopyAccelerationStructureToMemoryKHR)(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+VkResult
+genX(CopyMemoryToAccelerationStructureKHR)(
+ VkDevice _device,
+ VkDeferredOperationKHR deferredOperation,
+ const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+VkResult
+genX(WriteAccelerationStructuresPropertiesKHR)(
+ VkDevice _device,
+ uint32_t accelerationStructureCount,
+ const VkAccelerationStructureKHR* pAccelerationStructures,
+ VkQueryType queryType,
+ size_t dataSize,
+ void* pData,
+ size_t stride)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ unreachable("Unimplemented");
+ return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
+}
+
+#endif /* GFX_VERx10 >= 125 */
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index ced154f72e1..b9d1902d3b5 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -26,23 +26,41 @@
#include "anv_private.h"
#include "anv_measure.h"
-/* These are defined in anv_private.h and blorp_genX_exec.h */
+/* These are defined in anv_private.h and blorp_genX_exec_brw.h */
#undef __gen_address_type
#undef __gen_user_data
#undef __gen_combine_address
#include "common/intel_l3_config.h"
-#include "blorp/blorp_genX_exec.h"
+#include "blorp/blorp_genX_exec_brw.h"
+
+#include "ds/intel_tracepoints.h"
static void blorp_measure_start(struct blorp_batch *_batch,
const struct blorp_params *params)
{
struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch;
+ trace_intel_begin_blorp(&cmd_buffer->trace);
anv_measure_snapshot(cmd_buffer,
- params->snapshot_type,
+ blorp_op_to_intel_measure_snapshot(params->op),
NULL, 0);
}
+static void blorp_measure_end(struct blorp_batch *_batch,
+ const struct blorp_params *params)
+{
+ struct anv_cmd_buffer *cmd_buffer = _batch->driver_batch;
+ trace_intel_end_blorp(&cmd_buffer->trace,
+ params->op,
+ params->x1 - params->x0,
+ params->y1 - params->y0,
+ params->num_samples,
+ params->shader_pipeline,
+ params->dst.view.format,
+ params->src.view.format,
+ (_batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
+}
+
static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n)
{
@@ -55,10 +73,12 @@ blorp_emit_reloc(struct blorp_batch *batch,
void *location, struct blorp_address address, uint32_t delta)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
- assert(cmd_buffer->batch.start <= location &&
- location < cmd_buffer->batch.end);
- return anv_batch_emit_reloc(&cmd_buffer->batch, location,
- address.buffer, address.offset + delta);
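+   /* Everything is pinned at a fixed address, so no relocation is recorded;
+    * just track the BO for residency and return its physical address.
+    */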
+ struct anv_address anv_addr = {
+ .bo = address.buffer,
+ .offset = address.offset,
+ };
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, anv_addr.bo);
+ return anv_address_physical(anv_address_add(anv_addr, delta));
}
static void
@@ -66,59 +86,47 @@ blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
struct blorp_address address, uint32_t delta)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
- VkResult result;
-
- if (ANV_ALWAYS_SOFTPIN) {
- result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- address.buffer);
- if (unlikely(result != VK_SUCCESS))
- anv_batch_set_error(&cmd_buffer->batch, result);
- return;
- }
- uint64_t address_u64 = 0;
- result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- ss_offset, address.buffer,
- address.offset + delta,
- &address_u64);
- if (result != VK_SUCCESS)
+ VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
+ address.buffer);
+ if (unlikely(result != VK_SUCCESS))
anv_batch_set_error(&cmd_buffer->batch, result);
-
- void *dest = anv_block_pool_map(
- &cmd_buffer->device->surface_state_pool.block_pool, ss_offset, 8);
- write_reloc(cmd_buffer->device, dest, address_u64, false);
}
static uint64_t
blorp_get_surface_address(struct blorp_batch *blorp_batch,
struct blorp_address address)
{
- if (ANV_ALWAYS_SOFTPIN) {
- struct anv_address anv_addr = {
- .bo = address.buffer,
- .offset = address.offset,
- };
- return anv_address_physical(anv_addr);
- } else {
- /* We'll let blorp_surface_reloc write the address. */
- return 0;
- }
+ struct anv_address anv_addr = {
+ .bo = address.buffer,
+ .offset = address.offset,
+ };
+ return anv_address_physical(anv_addr);
}
-#if GFX_VER >= 7 && GFX_VER < 10
+#if GFX_VER == 9
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch)
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
return (struct blorp_address) {
- .buffer = cmd_buffer->device->surface_state_pool.block_pool.bo,
- .offset = 0,
+ .buffer = cmd_buffer->device->internal_surface_state_pool.block_pool.bo,
+ .offset = -cmd_buffer->device->internal_surface_state_pool.start_offset,
};
}
#endif
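+/* BLORP dynamic states are kept on the device in two flavors (legacy dynamic
+ * state heap and descriptor-buffer heap); return the offset that matches the
+ * command buffer's current descriptor-buffer mode.
+ */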
+static uint32_t
+blorp_get_dynamic_state(struct blorp_batch *batch,
+ enum blorp_dynamic_state name)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ return (cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) ?
+ cmd_buffer->device->blorp.dynamic_states[name].db_state.offset :
+ cmd_buffer->device->blorp.dynamic_states[name].state.offset;
+}
+
static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
uint32_t size,
@@ -134,7 +142,22 @@ blorp_alloc_dynamic_state(struct blorp_batch *batch,
return state.map;
}
-static void
+UNUSED static void *
+blorp_alloc_general_state(struct blorp_batch *batch,
+ uint32_t size,
+ uint32_t alignment,
+ uint32_t *offset)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+
+ struct anv_state state =
+ anv_cmd_buffer_alloc_general_state(cmd_buffer, size, alignment);
+
+ *offset = state.offset;
+ return state.map;
+}
+
+static bool
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
unsigned state_size, unsigned state_alignment,
uint32_t *bt_offset,
@@ -149,18 +172,30 @@ blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
anv_cmd_buffer_alloc_blorp_binding_table(cmd_buffer, num_entries,
&state_offset, &bt_state);
if (result != VK_SUCCESS)
- return;
+ return false;
uint32_t *bt_map = bt_state.map;
*bt_offset = bt_state.offset;
for (unsigned i = 0; i < num_entries; i++) {
struct anv_state surface_state =
- anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (surface_state.map == NULL)
+ return false;
+
bt_map[i] = surface_state.offset + state_offset;
surface_offsets[i] = surface_state.offset;
surface_maps[i] = surface_state.map;
}
+
+ return true;
+}
+
+static uint32_t
+blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
+ uint32_t offset)
+{
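+   /* The binding table offsets allocated above are already in the form the
+    * hardware pointer expects, so no conversion is needed.
+    */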
+ return offset;
}
static void *
@@ -169,11 +204,13 @@ blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
{
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
struct anv_state vb_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 64);
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, size, 64);
+ struct anv_address vb_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, vb_state);
*addr = (struct blorp_address) {
- .buffer = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = vb_state.offset,
+ .buffer = vb_addr.bo,
+ .offset = vb_addr.offset,
.mocs = isl_mocs(&cmd_buffer->device->isl_dev,
ISL_SURF_USAGE_VERTEX_BUFFER_BIT, false),
};
@@ -187,6 +224,7 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
uint32_t *sizes,
unsigned num_vbs)
{
+#if GFX_VER == 9
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
for (unsigned i = 0; i < num_vbs; i++) {
@@ -206,6 +244,7 @@ blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
*/
genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL,
(1 << num_vbs) - 1);
+#endif
}
UNUSED static struct blorp_address
@@ -226,6 +265,18 @@ blorp_flush_range(struct blorp_batch *batch, void *start, size_t size)
*/
}
+static void
+blorp_pre_emit_urb_config(struct blorp_batch *blorp_batch,
+ struct intel_urb_config *urb_cfg)
+{
+ struct anv_cmd_buffer *cmd_buffer = blorp_batch->driver_batch;
+ genX(urb_workaround)(cmd_buffer, urb_cfg);
+
+ /* Update urb config. */
+ memcpy(&cmd_buffer->state.gfx.urb_cfg, urb_cfg,
+ sizeof(struct intel_urb_config));
+}
+
static const struct intel_l3_config *
blorp_get_l3_config(struct blorp_batch *batch)
{
@@ -233,17 +284,17 @@ blorp_get_l3_config(struct blorp_batch *batch)
return cmd_buffer->state.current_l3_config;
}
-void
-genX(blorp_exec)(struct blorp_batch *batch,
- const struct blorp_params *params)
+static void
+blorp_exec_on_render(struct blorp_batch *batch,
+ const struct blorp_params *params)
{
+ assert((batch->flags & BLORP_BATCH_USE_COMPUTE) == 0);
+
struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT);
- if (!cmd_buffer->state.current_l3_config) {
- const struct intel_l3_config *cfg =
- intel_get_default_l3_config(&cmd_buffer->device->info);
- genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
- }
+ struct anv_gfx_dynamic_state *hw_state =
+ &cmd_buffer->state.gfx.dyn_state;
const unsigned scale = params->fast_clear_op ? UINT_MAX : 1;
genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, params->x1 - params->x0,
@@ -252,40 +303,52 @@ genX(blorp_exec)(struct blorp_batch *batch,
#if GFX_VER >= 11
/* The PIPE_CONTROL command description says:
*
- * "Whenever a Binding Table Index (BTI) used by a Render Taget Message
+ * "Whenever a Binding Table Index (BTI) used by a Render Target Message
* points to a different RENDER_SURFACE_STATE, SW must issue a Render
* Target Cache Flush by enabling this bit. When render target flush
* is set due to new association of BTI, PS Scoreboard Stall bit must
* be set in this packet."
*/
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
- "before blorp BTI change");
+ if (blorp_uses_bti_rt_writes(batch, params)) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
+ "before blorp BTI change");
+ }
#endif
- if (params->depth.enabled &&
- !(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
- genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, &params->depth.surf);
-
-#if GFX_VER == 7
- /* The MI_LOAD/STORE_REGISTER_MEM commands which BLORP uses to implement
- * indirect fast-clear colors can cause GPU hangs if we don't stall first.
- * See genX(cmd_buffer_mi_memcpy) for more details.
- */
- if (params->src.clear_color_addr.buffer ||
- params->dst.clear_color_addr.buffer) {
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "before blorp prep fast clear");
+#if GFX_VERx10 >= 125
+ /* Check if blorp ds state matches ours. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) {
+ bool blorp_ds_state = params->depth.enabled || params->stencil.enabled;
+ if (cmd_buffer->state.gfx.ds_write_state != blorp_ds_state) {
+ /* Flag the change in ds_write_state so that the next pipeline use
+ * will trigger a PIPE_CONTROL too.
+ */
+ cmd_buffer->state.gfx.ds_write_state = blorp_ds_state;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WA_18019816803);
+
+         /* Add the stall; it will be flushed prior to the blorp operation by
+          * genX(cmd_buffer_apply_pipe_flushes).
+          */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_PSS_STALL_SYNC_BIT,
+ "Wa_18019816803");
+ }
}
#endif
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ if (params->depth.enabled &&
+ !(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
+ genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, &params->depth.surf);
genX(flush_pipeline_select_3d)(cmd_buffer);
- genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
+ /* Wa_14015814527 */
+ genX(apply_task_urb_workaround)(cmd_buffer);
+
+   /* Apply any outstanding flushes in case the pipeline select hasn't. */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
/* BLORP doesn't do anything fancy with depth such as discards, so we want
* the PMA fix off. Also, off is always the safe option.
@@ -297,19 +360,151 @@ genX(blorp_exec)(struct blorp_batch *batch,
#if GFX_VER >= 11
/* The PIPE_CONTROL command description says:
*
- * "Whenever a Binding Table Index (BTI) used by a Render Taget Message
+ * "Whenever a Binding Table Index (BTI) used by a Render Target Message
* points to a different RENDER_SURFACE_STATE, SW must issue a Render
* Target Cache Flush by enabling this bit. When render target flush
* is set due to new association of BTI, PS Scoreboard Stall bit must
* be set in this packet."
*/
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
- "after blorp BTI change");
+ if (blorp_uses_bti_rt_writes(batch, params)) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
+ "after blorp BTI change");
+ }
+#endif
+
+ /* Flag all the instructions emitted by BLORP. */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+#if GFX_VER >= 11
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+#endif
+#if GFX_VER >= 12
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
#endif
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR);
+ if (batch->blorp->config.use_mesh_shading) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
+ }
+ if (params->wm_prog_data) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_BLEND);
+ }
+
+ anv_cmd_dirty_mask_t dirty = ~(ANV_CMD_DIRTY_INDEX_BUFFER |
+ ANV_CMD_DIRTY_XFB_ENABLE);
cmd_buffer->state.gfx.vb_dirty = ~0;
- cmd_buffer->state.gfx.dirty = ~0;
- cmd_buffer->state.push_constants_dirty = ~0;
+ cmd_buffer->state.gfx.dirty |= dirty;
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
+}
+
+static void
+blorp_exec_on_compute(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ assert(batch->flags & BLORP_BATCH_USE_COMPUTE);
+
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ assert(cmd_buffer->queue_family->queueFlags & VK_QUEUE_COMPUTE_BIT);
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+   /* Apply any outstanding flushes in case the pipeline select hasn't. */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ blorp_exec(batch, params);
+
+ cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->state.compute.pipeline_dirty = true;
+}
+
+static void
+blorp_exec_on_blitter(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ assert(batch->flags & BLORP_BATCH_USE_BLITTER);
+
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ assert(cmd_buffer->queue_family->queueFlags == VK_QUEUE_TRANSFER_BIT);
+
+ blorp_exec(batch, params);
+}
+
+void
+genX(blorp_exec)(struct blorp_batch *batch,
+ const struct blorp_params *params)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+
+ /* Turn on preemption if it was toggled off. */
+ if (!cmd_buffer->state.gfx.object_preemption)
+ genX(cmd_buffer_set_preemption)(cmd_buffer, true);
+
+ if (!cmd_buffer->state.current_l3_config) {
+ const struct intel_l3_config *cfg =
+ intel_get_default_l3_config(cmd_buffer->device->info);
+ genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
+ }
+
+ if (batch->flags & BLORP_BATCH_USE_BLITTER)
+ blorp_exec_on_blitter(batch, params);
+ else if (batch->flags & BLORP_BATCH_USE_COMPUTE)
+ blorp_exec_on_compute(batch, params);
+ else
+ blorp_exec_on_render(batch, params);
+}
+
+static void
+blorp_emit_pre_draw(struct blorp_batch *batch, const struct blorp_params *params)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+ blorp_measure_start(batch, params);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+}
+
+static void
+blorp_emit_post_draw(struct blorp_batch *batch, const struct blorp_params *params)
+{
+ struct anv_cmd_buffer *cmd_buffer = batch->driver_batch;
+
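+   /* BLORP draws a single 3-vertex RECTLIST, hence the fixed arguments to the
+    * post-3DPRIMITIVE workarounds below.
+    */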
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ _3DPRIM_RECTLIST,
+ 3);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ blorp_measure_end(batch, params);
+}
+
+void
+genX(blorp_init_dynamic_states)(struct blorp_context *context)
+{
+ blorp_init_dynamic_states(context);
}
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 7d3e72f1711..390a8ac2bde 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -26,17 +26,16 @@
#include "anv_private.h"
#include "anv_measure.h"
-#include "vk_format.h"
+#include "vk_render_pass.h"
#include "vk_util.h"
-#include "util/fast_idiv_by_const.h"
#include "common/intel_aux_map.h"
-#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
-#include "genxml/gen_rt_pack.h"
+#include "genxml/genX_rt_pack.h"
+#include "common/intel_genX_state_brw.h"
-#include "nir/nir_xfb_info.h"
+#include "ds/intel_tracepoints.h"
/* We reserve :
* - GPR 14 for secondary command buffer returns
@@ -48,6 +47,8 @@
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"
+#include "genX_cmd_draw_generated_flush.h"
+
static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
uint32_t pipeline);
@@ -56,11 +57,17 @@ convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
enum anv_pipe_bits bits = 0;
bits |= (pc->DepthCacheFlushEnable) ? ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
bits |= (pc->DCFlushEnable) ? ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
-#if GFX_VER >= 12
+#if GFX_VERx10 >= 125
+ bits |= (pc->PSSStallSyncEnable) ? ANV_PIPE_PSS_STALL_SYNC_BIT : 0;
+#endif
+#if GFX_VER == 12
bits |= (pc->TileCacheFlushEnable) ? ANV_PIPE_TILE_CACHE_FLUSH_BIT : 0;
+#endif
+#if GFX_VER >= 12
bits |= (pc->HDCPipelineFlushEnable) ? ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : 0;
#endif
bits |= (pc->RenderTargetCacheFlushEnable) ? ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
+ bits |= (pc->VFCacheInvalidationEnable) ? ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
bits |= (pc->StateCacheInvalidationEnable) ? ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
bits |= (pc->ConstantCacheInvalidationEnable) ? ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
bits |= (pc->TextureCacheInvalidationEnable) ? ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
@@ -68,22 +75,29 @@ convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
bits |= (pc->StallAtPixelScoreboard) ? ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
bits |= (pc->DepthStallEnable) ? ANV_PIPE_DEPTH_STALL_BIT : 0;
bits |= (pc->CommandStreamerStallEnable) ? ANV_PIPE_CS_STALL_BIT : 0;
+#if GFX_VERx10 == 125
+ bits |= (pc->UntypedDataPortCacheFlushEnable) ? ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT : 0;
+ bits |= (pc->CCSFlushEnable) ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0;
+#endif
return bits;
}
-#define anv_debug_dump_pc(pc) \
- if (unlikely(INTEL_DEBUG & DEBUG_PIPE_CONTROL)) { \
- fputs("pc: emit PC=( ", stderr); \
- anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
- fprintf(stderr, ") reason: %s\n", __FUNCTION__); \
+#define anv_debug_dump_pc(pc, reason) \
+ if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
+ fputs("pc: emit PC=( ", stdout); \
+ anv_dump_pipe_bits(convert_pc_to_bits(&(pc)), stdout); \
+ fprintf(stdout, ") reason: %s\n", reason); \
}
void
genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_video_queue(cmd_buffer))
+ return;
+
struct anv_device *device = cmd_buffer->device;
- UNUSED const struct intel_device_info *devinfo = &device->info;
- uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
+ const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
/* If we are emitting a new state base address we probably need to re-emit
* binding tables.
@@ -93,33 +107,22 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
/* Emit a render target cache flush.
*
* This isn't documented anywhere in the PRM. However, it seems to be
- * necessary prior to changing the surface state base adress. Without
+ * necessary prior to changing the surface state base address. Without
* this, we get GPU hangs when using multi-level command buffers which
* clear depth, reset state base address, and then go render stuff.
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ genx_batch_emit_pipe_control
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
#if GFX_VER >= 12
- pc.HDCPipelineFlushEnable = true;
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
#else
- pc.DCFlushEnable = true;
-#endif
- pc.RenderTargetCacheFlushEnable = true;
- pc.CommandStreamerStallEnable = true;
-#if GFX_VER == 12
- /* Wa_1606662791:
- *
- * Software must program PIPE_CONTROL command with "HDC Pipeline
- * Flush" prior to programming of the below two non-pipeline state :
- * * STATE_BASE_ADDRESS
- * * 3DSTATE_BINDING_TABLE_POOL_ALLOC
- */
- if (devinfo->revision == 0 /* A0 */)
- pc.HDCPipelineFlushEnable = true;
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
#endif
- anv_debug_dump_pc(pc);
- }
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT);
-#if GFX_VER == 12
+#if INTEL_NEEDS_WA_1607854226
/* Wa_1607854226:
*
* Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
@@ -129,94 +132,157 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
genX(flush_pipeline_select_3d)(cmd_buffer);
#endif
+   /* If no API entry point has selected the current mode yet (this can happen
+    * depending on the first operation recorded in the command buffer), select
+    * BUFFER if EXT_descriptor_buffer is enabled, otherwise LEGACY.
+    */
+ if (cmd_buffer->state.pending_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN) {
+ cmd_buffer->state.pending_db_mode =
+ cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer ?
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER :
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
+ }
+
anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
sba.GeneralStateMOCS = mocs;
+ sba.GeneralStateBufferSize = 0xfffff;
sba.GeneralStateBaseAddressModifyEnable = true;
+ sba.GeneralStateBufferSizeModifyEnable = true;
sba.StatelessDataPortAccessMOCS = mocs;
+#if GFX_VERx10 >= 125
+ sba.SurfaceStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.internal_surface_state_pool.addr,
+ };
+#else
sba.SurfaceStateBaseAddress =
anv_cmd_buffer_surface_base_address(cmd_buffer);
+#endif
sba.SurfaceStateMOCS = mocs;
sba.SurfaceStateBaseAddressModifyEnable = true;
- sba.DynamicStateBaseAddress =
- (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
- sba.DynamicStateMOCS = mocs;
- sba.DynamicStateBaseAddressModifyEnable = true;
-
sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
sba.IndirectObjectMOCS = mocs;
+ sba.IndirectObjectBufferSize = 0xfffff;
sba.IndirectObjectBaseAddressModifyEnable = true;
+ sba.IndirectObjectBufferSizeModifyEnable = true;
sba.InstructionBaseAddress =
(struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
sba.InstructionMOCS = mocs;
+ sba.InstructionBufferSize =
+ device->physical->va.instruction_state_pool.size / 4096;
sba.InstructionBaseAddressModifyEnable = true;
+ sba.InstructionBuffersizeModifyEnable = true;
+
+#if GFX_VER >= 11
+ sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
+ sba.BindlessSamplerStateBufferSize = 0;
+ sba.BindlessSamplerStateMOCS = mocs;
+ sba.BindlessSamplerStateBaseAddressModifyEnable = true;
+#endif
+
+ if (cmd_buffer->state.pending_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER) {
+ sba.DynamicStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.dynamic_state_db_pool.addr,
+ };
+ sba.DynamicStateBufferSize =
+ (device->physical->va.dynamic_state_db_pool.size +
+ device->physical->va.descriptor_buffer_pool.size +
+ device->physical->va.push_descriptor_buffer_pool.size) / 4096;
+ sba.DynamicStateMOCS = mocs;
+ sba.DynamicStateBaseAddressModifyEnable = true;
+ sba.DynamicStateBufferSizeModifyEnable = true;
+
+#if GFX_VERx10 >= 125
+ sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.descriptor_buffer_pool.addr,
+ };
+ sba.BindlessSurfaceStateSize =
+ (device->physical->va.descriptor_buffer_pool.size +
+ device->physical->va.push_descriptor_buffer_pool.size) - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+#else
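+         /* If the application has not bound any descriptor buffer yet, fall
+          * back to the workaround BO so the bindless surface base still points
+          * at valid memory.
+          */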
+ const uint64_t surfaces_addr =
+ cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
+ cmd_buffer->state.descriptor_buffers.surfaces_address :
+ anv_address_physical(device->workaround_address);
+ const uint64_t surfaces_size =
+ cmd_buffer->state.descriptor_buffers.surfaces_address != 0 ?
+ MIN2(device->physical->va.descriptor_buffer_pool.size -
+ (cmd_buffer->state.descriptor_buffers.surfaces_address -
+ device->physical->va.descriptor_buffer_pool.addr),
+ anv_physical_device_bindless_heap_size(device->physical, true)) :
+ (device->workaround_bo->size - device->workaround_address.offset);
+ sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
+ .offset = surfaces_addr,
+ };
+ sba.BindlessSurfaceStateSize = surfaces_size / ANV_SURFACE_STATE_SIZE - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+#endif /* GFX_VERx10 < 125 */
+ } else if (!device->physical->indirect_descriptors) {
+#if GFX_VERx10 >= 125
+ sba.DynamicStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.dynamic_state_pool.addr,
+ };
+ sba.DynamicStateBufferSize =
+ (device->physical->va.dynamic_state_pool.size +
+ device->physical->va.sampler_state_pool.size) / 4096;
+ sba.DynamicStateMOCS = mocs;
+ sba.DynamicStateBaseAddressModifyEnable = true;
+ sba.DynamicStateBufferSizeModifyEnable = true;
-# if (GFX_VER >= 8)
- /* Broadwell requires that we specify a buffer size for a bunch of
- * these fields. However, since we will be growing the BO's live, we
- * just set them all to the maximum.
- */
- sba.GeneralStateBufferSize = 0xfffff;
- sba.IndirectObjectBufferSize = 0xfffff;
- if (anv_use_softpin(device->physical)) {
- /* With softpin, we use fixed addresses so we actually know how big
- * our base addresses are.
- */
- sba.DynamicStateBufferSize = DYNAMIC_STATE_POOL_SIZE / 4096;
- sba.InstructionBufferSize = INSTRUCTION_STATE_POOL_SIZE / 4096;
- } else {
- sba.DynamicStateBufferSize = 0xfffff;
- sba.InstructionBufferSize = 0xfffff;
- }
- sba.GeneralStateBufferSizeModifyEnable = true;
- sba.IndirectObjectBufferSizeModifyEnable = true;
- sba.DynamicStateBufferSizeModifyEnable = true;
- sba.InstructionBuffersizeModifyEnable = true;
-# else
- /* On gfx7, we have upper bounds instead. According to the docs,
- * setting an upper bound of zero means that no bounds checking is
- * performed so, in theory, we should be able to leave them zero.
- * However, border color is broken and the GPU bounds-checks anyway.
- * To avoid this and other potential problems, we may as well set it
- * for everything.
- */
- sba.GeneralStateAccessUpperBound =
- (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
- sba.GeneralStateAccessUpperBoundModifyEnable = true;
- sba.DynamicStateAccessUpperBound =
- (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
- sba.DynamicStateAccessUpperBoundModifyEnable = true;
- sba.InstructionAccessUpperBound =
- (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
- sba.InstructionAccessUpperBoundModifyEnable = true;
-# endif
-# if (GFX_VER >= 9)
- if (anv_use_softpin(device->physical)) {
sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
- .bo = device->surface_state_pool.block_pool.bo,
- .offset = 0,
+ .offset = device->physical->va.internal_surface_state_pool.addr,
};
- sba.BindlessSurfaceStateSize = (1 << 20) - 1;
+ sba.BindlessSurfaceStateSize =
+ (device->physical->va.internal_surface_state_pool.size +
+ device->physical->va.bindless_surface_state_pool.size) - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+#else
+ unreachable("Direct descriptor not supported");
+#endif
} else {
- sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS;
- sba.BindlessSurfaceStateSize = 0;
+ sba.DynamicStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.dynamic_state_pool.addr,
+ };
+ sba.DynamicStateBufferSize =
+ (device->physical->va.dynamic_state_pool.size +
+ device->physical->va.sampler_state_pool.size) / 4096;
+ sba.DynamicStateMOCS = mocs;
+ sba.DynamicStateBaseAddressModifyEnable = true;
+ sba.DynamicStateBufferSizeModifyEnable = true;
+
+ sba.BindlessSurfaceStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.bindless_surface_state_pool.addr,
+ };
+ sba.BindlessSurfaceStateSize =
+ anv_physical_device_bindless_heap_size(device->physical, false) /
+ ANV_SURFACE_STATE_SIZE - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
}
- sba.BindlessSurfaceStateMOCS = mocs;
- sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
-# endif
-# if (GFX_VER >= 10)
- sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
- sba.BindlessSamplerStateMOCS = mocs;
- sba.BindlessSamplerStateBaseAddressModifyEnable = true;
- sba.BindlessSamplerStateBufferSize = 0;
-# endif
+
+#if GFX_VERx10 >= 125
+ sba.L1CacheControl = L1CC_WB;
+#endif
}
-#if GFX_VER == 12
+ bool db_mode_changed = false;
+ if (cmd_buffer->state.current_db_mode != cmd_buffer->state.pending_db_mode) {
+ cmd_buffer->state.current_db_mode = cmd_buffer->state.pending_db_mode;
+ db_mode_changed = true;
+ }
+
+#if INTEL_NEEDS_WA_1607854226
/* Wa_1607854226:
*
* Put the pipeline back into its current mode.
@@ -225,8 +291,12 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
genX(flush_pipeline_select)(cmd_buffer, gfx12_wa_pipeline);
#endif
+#if GFX_VERx10 >= 125
+ genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
+#endif
+
/* After re-setting the surface state base address, we have to do some
- * cache flusing so that the sampler engine will pick up the new
+ * cache flushing so that the sampler engine will pick up the new
* SURFACE_STATE objects and binding tables. From the Broadwell PRM,
* Shared Function > 3D Sampler > State > State Caching (page 96):
*
@@ -261,332 +331,145 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
* sufficient. The theory here is that all of the sampling/rendering
* units cache the binding table in the texture cache. However, we have
* yet to be able to actually confirm this.
+ *
+ * Wa_14013910100:
+ *
+ * "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
+ * or program pipe control with Instruction cache invalidate post
+ * STATE_BASE_ADDRESS command"
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.TextureCacheInvalidationEnable = true;
- pc.ConstantCacheInvalidationEnable = true;
- pc.StateCacheInvalidationEnable = true;
- anv_debug_dump_pc(pc);
+ enum anv_pipe_bits bits =
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ (intel_needs_workaround(cmd_buffer->device->info, 16013000631) ?
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0);
+
+#if GFX_VER >= 9 && GFX_VER <= 11
+ /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
+ *
+ * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
+ * always set for GPGPU workloads when “Texture Cache Invalidation
+ * Enable” bit is set".
+ *
+    * This workaround stopped appearing in TGL PRMs.
+ */
+ if (cmd_buffer->state.current_pipeline == GPGPU)
+ bits |= ANV_PIPE_CS_STALL_BIT;
+#endif
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ bits);
+
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ if (db_mode_changed) {
+#if GFX_VER == 11 || GFX_VERx10 == 125
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
+ ptr.SliceHashStatePointerValid = true;
+ ptr.SliceHashTableStatePointer = cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ device->slice_hash_db.offset :
+ device->slice_hash.offset;
+ }
+#endif
+
+ /* Changing the dynamic state location affects all the states having
+ * offset relative to that pointer.
+ */
+ struct anv_gfx_dynamic_state *hw_state = &cmd_buffer->state.gfx.dyn_state;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE);
+ if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate) {
+ struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_FSR);
+ }
+
+#if GFX_VERx10 < 125
+ /* The push constant data for compute shader is an offset in the dynamic
+ * state heap. If we change it, we need to reemit the push constants.
+ */
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->state.compute.base.push_constants_data_dirty = true;
+#endif
}
}
-static void
-add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
- struct anv_state state, struct anv_address addr)
+void
+genX(cmd_buffer_emit_bt_pool_base_address)(struct anv_cmd_buffer *cmd_buffer)
{
- VkResult result;
+ if (!anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer))
+ return;
- if (anv_use_softpin(cmd_buffer->device->physical)) {
- result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- addr.bo);
- } else {
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- state.offset + isl_dev->ss.addr_offset,
- addr.bo, addr.offset, NULL);
+   /* If we are emitting a new binding table pool base address, we probably
+    * need to re-emit binding tables.
+    */
+ cmd_buffer->state.descriptors_dirty |= ~0;
+
+#if GFX_VERx10 >= 125
+ struct anv_device *device = cmd_buffer->device;
+ const uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
+
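+   /* 3DSTATE_BINDING_TABLE_POOL_ALLOC is non-pipelined: stall the command
+    * streamer before reprogramming it and invalidate the state cache after so
+    * stale binding table pointers are not reused.
+    */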
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT);
+ anv_batch_emit(
+ &cmd_buffer->batch, GENX(3DSTATE_BINDING_TABLE_POOL_ALLOC), btpa) {
+ btpa.BindingTablePoolBaseAddress =
+ anv_cmd_buffer_surface_base_address(cmd_buffer);
+ btpa.BindingTablePoolBufferSize = device->physical->va.binding_table_pool.size / 4096;
+ btpa.MOCS = mocs;
}
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
+#else /* GFX_VERx10 < 125 */
+ genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+#endif
+}
+
+static void
+add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr)
+{
+ VkResult result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
+ addr.bo);
+
if (unlikely(result != VK_SUCCESS))
anv_batch_set_error(&cmd_buffer->batch, result);
}
static void
add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
- struct anv_surface_state state)
+ const struct anv_surface_state *state)
{
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
-
- assert(!anv_address_is_null(state.address));
- add_surface_reloc(cmd_buffer, state.state, state.address);
+ assert(!anv_address_is_null(state->address));
+ add_surface_reloc(cmd_buffer, state->address);
- if (!anv_address_is_null(state.aux_address)) {
+ if (!anv_address_is_null(state->aux_address)) {
VkResult result =
- anv_reloc_list_add(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- state.state.offset + isl_dev->ss.aux_addr_offset,
- state.aux_address.bo,
- state.aux_address.offset,
- NULL);
+ anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
+ state->aux_address.bo);
if (result != VK_SUCCESS)
anv_batch_set_error(&cmd_buffer->batch, result);
}
- if (!anv_address_is_null(state.clear_address)) {
+ if (!anv_address_is_null(state->clear_address)) {
VkResult result =
- anv_reloc_list_add(&cmd_buffer->surface_relocs,
- &cmd_buffer->pool->alloc,
- state.state.offset +
- isl_dev->ss.clear_color_state_offset,
- state.clear_address.bo,
- state.clear_address.offset,
- NULL);
+ anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
+ state->clear_address.bo);
if (result != VK_SUCCESS)
anv_batch_set_error(&cmd_buffer->batch, result);
}
}
-static bool
-isl_color_value_requires_conversion(union isl_color_value color,
- const struct isl_surf *surf,
- const struct isl_view *view)
-{
- if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
- return false;
-
- uint32_t surf_pack[4] = { 0, 0, 0, 0 };
- isl_color_value_pack(&color, surf->format, surf_pack);
-
- uint32_t view_pack[4] = { 0, 0, 0, 0 };
- union isl_color_value swiz_color =
- isl_color_value_swizzle_inv(color, view->swizzle);
- isl_color_value_pack(&swiz_color, view->format, view_pack);
-
- return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
-}
-
-static bool
-anv_can_fast_clear_color_view(struct anv_device * device,
- struct anv_image_view *iview,
- VkImageLayout layout,
- union isl_color_value clear_color,
- uint32_t num_layers,
- VkRect2D render_area)
-{
- if (iview->planes[0].isl.base_array_layer >=
- anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
- iview->planes[0].isl.base_level))
- return false;
-
- /* Start by getting the fast clear type. We use the first subpass
- * layout here because we don't want to fast-clear if the first subpass
- * to use the attachment can't handle fast-clears.
- */
- enum anv_fast_clear_type fast_clear_type =
- anv_layout_to_fast_clear_type(&device->info, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- layout);
- switch (fast_clear_type) {
- case ANV_FAST_CLEAR_NONE:
- return false;
- case ANV_FAST_CLEAR_DEFAULT_VALUE:
- if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
- return false;
- break;
- case ANV_FAST_CLEAR_ANY:
- break;
- }
-
- /* Potentially, we could do partial fast-clears but doing so has crazy
- * alignment restrictions. It's easier to just restrict to full size
- * fast clears for now.
- */
- if (render_area.offset.x != 0 ||
- render_area.offset.y != 0 ||
- render_area.extent.width != iview->vk.extent.width ||
- render_area.extent.height != iview->vk.extent.height)
- return false;
-
- /* On Broadwell and earlier, we can only handle 0/1 clear colors */
- if (GFX_VER <= 8 &&
- !isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
- return false;
-
- /* If the clear color is one that would require non-trivial format
- * conversion on resolve, we don't bother with the fast clear. This
- * shouldn't be common as most clear colors are 0/1 and the most common
- * format re-interpretation is for sRGB.
- */
- if (isl_color_value_requires_conversion(clear_color,
- &iview->image->planes[0].primary_surface.isl,
- &iview->planes[0].isl)) {
- anv_perf_warn(device, &iview->vk.base,
- "Cannot fast-clear to colors which would require "
- "format conversion on resolve");
- return false;
- }
-
- /* We only allow fast clears to the first slice of an image (level 0,
- * layer 0) and only for the entire slice. This guarantees us that, at
- * any given time, there is only one clear color on any given image at
- * any given time. At the time of our testing (Jan 17, 2018), there
- * were no known applications which would benefit from fast-clearing
- * more than just the first slice.
- */
- if (iview->planes[0].isl.base_level > 0 ||
- iview->planes[0].isl.base_array_layer > 0) {
- anv_perf_warn(device, &iview->image->vk.base,
- "Rendering with multi-lod or multi-layer framebuffer "
- "with LOAD_OP_LOAD and baseMipLevel > 0 or "
- "baseArrayLayer > 0. Not fast clearing.");
- return false;
- }
-
- if (num_layers > 1) {
- anv_perf_warn(device, &iview->image->vk.base,
- "Rendering to a multi-layer framebuffer with "
- "LOAD_OP_CLEAR. Only fast-clearing the first slice");
- }
-
- return true;
-}
-
-static bool
-anv_can_hiz_clear_ds_view(struct anv_device *device,
- struct anv_image_view *iview,
- VkImageLayout layout,
- VkImageAspectFlags clear_aspects,
- float depth_clear_value,
- VkRect2D render_area)
-{
- /* We don't do any HiZ or depth fast-clears on gfx7 yet */
- if (GFX_VER == 7)
- return false;
-
- /* If we're just clearing stencil, we can always HiZ clear */
- if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
- return true;
-
- /* We must have depth in order to have HiZ */
- if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
- return false;
-
- const enum isl_aux_usage clear_aux_usage =
- anv_layout_to_aux_usage(&device->info, iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
- layout);
- if (!blorp_can_hiz_clear_depth(&device->info,
- &iview->image->planes[0].primary_surface.isl,
- clear_aux_usage,
- iview->planes[0].isl.base_level,
- iview->planes[0].isl.base_array_layer,
- render_area.offset.x,
- render_area.offset.y,
- render_area.offset.x +
- render_area.extent.width,
- render_area.offset.y +
- render_area.extent.height))
- return false;
-
- if (depth_clear_value != ANV_HZ_FC_VAL)
- return false;
-
- /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
- * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
- * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
- */
- if (GFX_VER == 8 && anv_can_sample_with_hiz(&device->info, iview->image))
- return false;
-
- /* If we got here, then we can fast clear */
- return true;
-}
-
-#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
-
-#if GFX_VER == 12
-static void
-anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- uint32_t base_level, uint32_t level_count,
- uint32_t base_layer, uint32_t layer_count)
-{
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
-
- const struct anv_surface *surface = &image->planes[plane].primary_surface;
- uint64_t base_address =
- anv_address_physical(anv_image_address(image, &surface->memory_range));
-
- const struct isl_surf *isl_surf = &image->planes[plane].primary_surface.isl;
- uint64_t format_bits = intel_aux_map_format_bits_for_isl_surf(isl_surf);
-
- /* We're about to live-update the AUX-TT. We really don't want anyone else
- * trying to read it while we're doing this. We could probably get away
- * with not having this stall in some cases if we were really careful but
- * it's better to play it safe. Full stall the GPU.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "before update AUX-TT");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
-
- for (uint32_t a = 0; a < layer_count; a++) {
- const uint32_t layer = base_layer + a;
-
- uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
- for (uint32_t l = 0; l < level_count; l++) {
- const uint32_t level = base_level + l;
-
- uint32_t logical_array_layer, logical_z_offset_px;
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- logical_array_layer = 0;
-
- /* If the given miplevel does not have this layer, then any higher
- * miplevels won't either because miplevels only get smaller the
- * higher the LOD.
- */
- assert(layer < image->vk.extent.depth);
- if (layer >= anv_minify(image->vk.extent.depth, level))
- break;
- logical_z_offset_px = layer;
- } else {
- assert(layer < image->vk.array_layers);
- logical_array_layer = layer;
- logical_z_offset_px = 0;
- }
-
- uint64_t slice_start_offset_B, slice_end_offset_B;
- isl_surf_get_image_range_B_tile(isl_surf, level,
- logical_array_layer,
- logical_z_offset_px,
- &slice_start_offset_B,
- &slice_end_offset_B);
-
- start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
- end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
- }
-
- /* Aux operates 64K at a time */
- start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
- end_offset_B = align_u64(end_offset_B, 64 * 1024);
-
- for (uint64_t offset = start_offset_B;
- offset < end_offset_B; offset += 64 * 1024) {
- uint64_t address = base_address + offset;
-
- uint64_t aux_entry_addr64, *aux_entry_map;
- aux_entry_map = intel_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
- address, &aux_entry_addr64);
-
- assert(anv_use_softpin(cmd_buffer->device->physical));
- struct anv_address aux_entry_address = {
- .bo = NULL,
- .offset = aux_entry_addr64,
- };
-
- const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
- uint64_t new_aux_entry =
- (old_aux_entry & INTEL_AUX_MAP_ADDRESS_MASK) | format_bits;
-
- if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage))
- new_aux_entry |= INTEL_AUX_MAP_ENTRY_VALID_BIT;
-
- mi_store(&b, mi_mem64(aux_entry_address), mi_imm(new_aux_entry));
- }
- }
-
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
- "after update AUX-TT");
-}
-#endif /* GFX_VER == 12 */
-
/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
* the initial layout is undefined, the HiZ buffer and depth buffer will
* represent the same data at the end of this operation.
@@ -594,6 +477,7 @@ anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
static void
transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
+ uint32_t base_level, uint32_t level_count,
uint32_t base_layer, uint32_t layer_count,
VkImageLayout initial_layout,
VkImageLayout final_layout,
@@ -604,32 +488,22 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
return;
-#if GFX_VER == 12
- if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
- initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
- cmd_buffer->device->physical->has_implicit_ccs &&
- cmd_buffer->device->info.has_aux_map) {
- anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, 1, base_layer, layer_count);
- }
-#endif
-
/* If will_full_fast_clear is set, the caller promises to fast-clear the
- * largest portion of the specified range as it can. For depth images,
- * that means the entire image because we don't support multi-LOD HiZ.
+ * largest portion of the specified range as it can.
*/
- assert(image->planes[0].primary_surface.isl.levels == 1);
if (will_full_fast_clear)
return;
const enum isl_aux_state initial_state =
- anv_layout_to_aux_state(&cmd_buffer->device->info, image,
+ anv_layout_to_aux_state(cmd_buffer->device->info, image,
VK_IMAGE_ASPECT_DEPTH_BIT,
- initial_layout);
+ initial_layout,
+ cmd_buffer->queue_family->queueFlags);
const enum isl_aux_state final_state =
- anv_layout_to_aux_state(&cmd_buffer->device->info, image,
+ anv_layout_to_aux_state(cmd_buffer->device->info, image,
VK_IMAGE_ASPECT_DEPTH_BIT,
- final_layout);
+ final_layout,
+ cmd_buffer->queue_family->queueFlags);
const bool initial_depth_valid =
isl_aux_state_has_valid_primary(initial_state);
@@ -642,36 +516,49 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
/* Getting into the pass-through state for Depth is tricky and involves
* both a resolve and an ambiguate. We don't handle that state right now
- * as anv_layout_to_aux_state never returns it. Resolve/ambiguate will
- * trigger depth clears which require tile cache flushes.
+ * as anv_layout_to_aux_state never returns it.
*/
assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
+ enum isl_aux_op hiz_op = ISL_AUX_OP_NONE;
if (final_needs_depth && !initial_depth_valid) {
assert(initial_hiz_valid);
- anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_TILE_CACHE_FLUSH_BIT,
- "after depth resolve");
+ hiz_op = ISL_AUX_OP_FULL_RESOLVE;
} else if (final_needs_hiz && !initial_hiz_valid) {
assert(initial_depth_valid);
- anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
- 0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
+ hiz_op = ISL_AUX_OP_AMBIGUATE;
+ }
+
+ if (hiz_op != ISL_AUX_OP_NONE) {
+ for (uint32_t l = 0; l < level_count; l++) {
+ const uint32_t level = base_level + l;
+
+ uint32_t aux_layers =
+ anv_image_aux_layers(image, VK_IMAGE_ASPECT_DEPTH_BIT, level);
+ if (base_layer >= aux_layers)
+ break; /* We will only get fewer layers as level increases */
+ uint32_t level_layer_count =
+ MIN2(layer_count, aux_layers - base_layer);
+
+ anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
+ l, base_layer, level_layer_count, hiz_op);
+ }
+ }
+
+ /* Additional tile cache flush for MTL:
+ *
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
+ */
+ if (intel_device_info_is_mtl(cmd_buffer->device->info) &&
+ image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_HIZ_CCS &&
+ final_needs_depth && !initial_depth_valid) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_TILE_CACHE_FLUSH_BIT,
- "after hiz resolve");
+ "HIZ-CCS flush");
}
}
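/* Editor's note: illustrative sketch only, not part of this patch. It
 * mirrors the per-level clamping done by the hiz_op loop above: higher
 * mip levels expose fewer aux layers, so the layer range is clamped per
 * level and the loop stops once base_layer falls outside the level.
 */
static inline uint32_t
example_level_layer_count(uint32_t aux_layers,
                          uint32_t base_layer, uint32_t layer_count)
{
   if (base_layer >= aux_layers)
      return 0; /* nothing to do at this or any higher level */
   return MIN2(layer_count, aux_layers - base_layer);
}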
-static inline bool
-vk_image_layout_stencil_write_optimal(VkImageLayout layout)
-{
- return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
- layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
- layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR;
-}
-
/* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
* the initial layout is undefined, the HiZ buffer and depth buffer will
* represent the same data at the end of this operation.
@@ -685,35 +572,7 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
VkImageLayout final_layout,
bool will_full_fast_clear)
{
-#if GFX_VER == 7
- const uint32_t plane =
- anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
-
- /* On gfx7, we have to store a texturable version of the stencil buffer in
- * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
- * forth at strategic points. Stencil writes are only allowed in following
- * layouts:
- *
- * - VK_IMAGE_LAYOUT_GENERAL
- * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
- * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
- * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
- * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
- *
- * For general, we have no nice opportunity to transition so we do the copy
- * to the shadow unconditionally at the end of the subpass. For transfer
- * destinations, we can update it as part of the transfer op. For the other
- * layouts, we delay the copy until a transition into some other layout.
- */
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- vk_image_layout_stencil_write_optimal(initial_layout) &&
- !vk_image_layout_stencil_write_optimal(final_layout)) {
- anv_image_copy_to_shadow(cmd_buffer, image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- base_level, level_count,
- base_layer, layer_count);
- }
-#elif GFX_VER == 12
+#if GFX_VER == 12
const uint32_t plane =
anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
if (image->planes[plane].aux_usage == ISL_AUX_USAGE_NONE)
@@ -721,11 +580,7 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
if ((initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) &&
- cmd_buffer->device->physical->has_implicit_ccs &&
- cmd_buffer->device->info.has_aux_map) {
- anv_image_init_aux_tt(cmd_buffer, image, VK_IMAGE_ASPECT_STENCIL_BIT,
- base_level, level_count, base_layer, layer_count);
-
+ cmd_buffer->device->info->has_aux_map) {
/* If will_full_fast_clear is set, the caller promises to fast-clear the
* largest portion of the specified range as it can.
*/
@@ -737,8 +592,8 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
const VkRect2D clear_rect = {
.offset.x = 0,
.offset.y = 0,
- .extent.width = anv_minify(image->vk.extent.width, level),
- .extent.height = anv_minify(image->vk.extent.height, level),
+ .extent.width = u_minify(image->vk.extent.width, level),
+ .extent.height = u_minify(image->vk.extent.height, level),
};
uint32_t aux_layers =
@@ -757,6 +612,17 @@ transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
clear_rect, 0 /* Stencil clear value */);
}
}
+
+ /* Additional tile cache flush for MTL:
+ *
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10420
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/10530
+ */
+ if (intel_device_info_is_mtl(cmd_buffer->device->info)) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT,
+ "HIZ-CCS flush");
+ }
#endif
}
@@ -775,7 +641,7 @@ set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
/* We only have compression tracking for CCS_E */
- if (image->planes[plane].aux_usage != ISL_AUX_USAGE_CCS_E)
+ if (!isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage))
return;
for (uint32_t a = 0; a < layer_count; a++) {
@@ -787,6 +653,22 @@ set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
sdi.ImmediateData = compressed ? UINT32_MAX : 0;
}
}
+
+ /* FCV_CCS_E images are automatically fast cleared to default value at
+ * render time. In order to account for this, anv should set the
+ * appropriate fast clear state for level0/layer0.
+ *
+ * At the moment, tracking the fast clear state for higher levels/layers is
+ * neither supported, nor do we enter a situation where it is a concern.
+ */
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
+ base_layer == 0 && level == 0) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
+ sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
+ image, aspect);
+ sdi.ImmediateData = ANV_FAST_CLEAR_DEFAULT_VALUE;
+ }
+ }
}
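/* Editor's note: illustrative sketch, not part of this patch. It captures
 * the condition used above for the extra FCV write: only FCV_CCS_E images
 * get an implicit default-value fast clear recorded, and only for the
 * first subresource, since anv does not track fast-clear state beyond
 * level 0 / layer 0.
 */
static inline bool
example_fcv_tracks_default_clear(enum isl_aux_usage aux_usage,
                                 uint32_t level, uint32_t base_layer)
{
   return aux_usage == ISL_AUX_USAGE_FCV_CCS_E &&
          level == 0 && base_layer == 0;
}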
static void
@@ -811,7 +693,6 @@ set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
/* This is only really practical on haswell and above because it requires
* MI math in order to get it correct.
*/
-#if GFX_VERx10 >= 75
static void
anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
const struct anv_image *image,
@@ -820,12 +701,14 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
enum isl_aux_op resolve_op,
enum anv_fast_clear_type fast_clear_supported)
{
+ struct anv_address addr = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
+ image, aspect);
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
+ mi_builder_set_mocs(&b, mocs);
- const struct mi_value fast_clear_type =
- mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
- image, aspect));
+ const struct mi_value fast_clear_type = mi_mem32(addr);
if (resolve_op == ISL_AUX_OP_FULL_RESOLVE) {
/* In this case, we're doing a full resolve which means we want the
@@ -892,50 +775,6 @@ anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
mip.CompareOperation = COMPARE_SRCS_EQUAL;
}
}
-#endif /* GFX_VERx10 >= 75 */
-
-#if GFX_VER <= 8
-static void
-anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- uint32_t level, uint32_t array_layer,
- enum isl_aux_op resolve_op,
- enum anv_fast_clear_type fast_clear_supported)
-{
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
-
- struct mi_value fast_clear_type_mem =
- mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
- image, aspect));
-
- /* This only works for partial resolves and only when the clear color is
- * all or nothing. On the upside, this emits less command streamer code
- * and works on Ivybridge and Bay Trail.
- */
- assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
- assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
-
- /* We don't support fast clears on anything other than the first slice. */
- if (level > 0 || array_layer > 0)
- return;
-
- /* On gfx8, we don't have a concept of default clear colors because we
- * can't sample from CCS surfaces. It's enough to just load the fast clear
- * state into the predicate register.
- */
- mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
- mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
- mi_store(&b, fast_clear_type_mem, mi_imm(0));
-
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_SET;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
-}
-#endif /* GFX_VER <= 8 */
static void
anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
@@ -949,15 +788,9 @@ anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
{
const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
-#if GFX_VER >= 9
anv_cmd_compute_resolve_predicate(cmd_buffer, image,
aspect, level, array_layer,
resolve_op, fast_clear_supported);
-#else /* GFX_VER <= 8 */
- anv_cmd_simple_resolve_predicate(cmd_buffer, image,
- aspect, level, array_layer,
- resolve_op, fast_clear_supported);
-#endif
/* CCS_D only supports full resolves and BLORP will assert on us if we try
* to do a partial resolve on a CCS_D surface.
@@ -983,16 +816,12 @@ anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
-#if GFX_VERx10 >= 75
anv_cmd_compute_resolve_predicate(cmd_buffer, image,
aspect, 0, array_layer,
resolve_op, fast_clear_supported);
anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
array_layer, 1, resolve_op, NULL, true);
-#else
- unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
-#endif
}
void
@@ -1007,13 +836,12 @@ genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
/* The aspect must be exactly one of the image aspects. */
assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
- /* The only compression types with more than just fast-clears are MCS,
- * CCS_E, and HiZ. With HiZ we just trust the layout and don't actually
- * track the current fast-clear and compression state. This leaves us
- * with just MCS and CCS_E.
+ /* Filter out aux usages that don't have any compression tracking.
+ * Note: We only have compression tracking for CCS_E images, but it's
+ * possible for a CCS_E enabled image to have a subresource with a different
+ * aux usage.
*/
- if (aux_usage != ISL_AUX_USAGE_CCS_E &&
- aux_usage != ISL_AUX_USAGE_MCS)
+ if (!isl_aux_usage_has_compression(aux_usage))
return;
set_image_compressed_bit(cmd_buffer, image, aspect,
@@ -1028,117 +856,119 @@ init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
assert(cmd_buffer && image);
assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
- set_image_fast_clear_state(cmd_buffer, image, aspect,
- ANV_FAST_CLEAR_NONE);
-
- /* Initialize the struct fields that are accessed for fast-clears so that
+ /* Initialize the struct fields that are accessed for fast clears so that
* the HW restrictions on the field values are satisfied.
+ *
+ * On generations that do not support indirect clear color natively, we
+ * can just skip initializing the values, because they will be set by
+ * BLORP before actually doing the fast clear.
+ *
+ * For newer generations, we may not be able to skip initialization.
+ * Testing shows that writing to CLEAR_COLOR causes corruption if
+ * the surface is currently being used. So, care must be taken here.
+ * There are two cases that we consider:
+ *
+ * 1. For CCS_E without FCV, we can skip initializing the color-related
+ * fields, just like on the older platforms. Also, DWORDS 6 and 7
+ * are marked MBZ (or have a usable field on gfx11), but we can skip
+ * initializing them because in practice these fields need other
+ * state to be programmed for their values to matter.
+ *
+ * 2. When the FCV optimization is enabled, we must initialize the
+ * color-related fields. Otherwise, the engine might reference their
+ * uninitialized contents before we fill them for a manual fast clear
+ * with BLORP. Although the surface may be in use, no synchronization
+ * is needed before initialization. The only possible clear color we
+ * support in this mode is 0.
*/
- struct anv_address addr =
- anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
-
- if (GFX_VER >= 9) {
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- const unsigned num_dwords = GFX_VER >= 10 ?
- isl_dev->ss.clear_color_state_size / 4 :
- isl_dev->ss.clear_value_size / 4;
+#if GFX_VER == 12
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
+ assert(!image->planes[plane].can_non_zero_fast_clear);
+ assert(cmd_buffer->device->isl_dev.ss.clear_color_state_size == 32);
+
+ unsigned num_dwords = 6;
+ struct anv_address addr =
+ anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
+
for (unsigned i = 0; i < num_dwords; i++) {
anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
sdi.Address = addr;
sdi.Address.offset += i * 4;
sdi.ImmediateData = 0;
- }
- }
- } else {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
- sdi.Address = addr;
- if (GFX_VERx10 >= 75) {
- /* Pre-SKL, the dword containing the clear values also contains
- * other fields, so we need to initialize those fields to match the
- * values that would be in a color attachment.
- */
- sdi.ImmediateData = ISL_CHANNEL_SELECT_RED << 25 |
- ISL_CHANNEL_SELECT_GREEN << 22 |
- ISL_CHANNEL_SELECT_BLUE << 19 |
- ISL_CHANNEL_SELECT_ALPHA << 16;
- } else if (GFX_VER == 7) {
- /* On IVB, the dword containing the clear values also contains
- * other fields that must be zero or can be zero.
- */
- sdi.ImmediateData = 0;
+ sdi.ForceWriteCompletionCheck = i == (num_dwords - 1);
}
}
}
+#endif
}
/* Copy the fast-clear value dword(s) between a surface state object and an
* image's fast clear state buffer.
*/
-static void
-genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
+void
+genX(load_image_clear_color)(struct anv_cmd_buffer *cmd_buffer,
struct anv_state surface_state,
- const struct anv_image *image,
- VkImageAspectFlagBits aspect,
- bool copy_from_surface_state)
+ const struct anv_image *image)
{
+#if GFX_VER < 10
assert(cmd_buffer && image);
assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
- struct anv_address ss_clear_addr = {
- .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
- .offset = surface_state.offset +
- cmd_buffer->device->isl_dev.ss.clear_value_offset,
- };
+ struct anv_address ss_clear_addr =
+ anv_state_pool_state_address(
+ &cmd_buffer->device->internal_surface_state_pool,
+ (struct anv_state) {
+ .offset = surface_state.offset +
+ cmd_buffer->device->isl_dev.ss.clear_value_offset
+ });
const struct anv_address entry_addr =
- anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
+ anv_image_get_clear_color_addr(cmd_buffer->device, image,
+ VK_IMAGE_ASPECT_COLOR_BIT);
unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
-#if GFX_VER == 7
- /* On gfx7, the combination of commands used here(MI_LOAD_REGISTER_MEM
- * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
- * in-flight when they are issued even if the memory touched is not
- * currently active for rendering. The weird bit is that it is not the
- * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
- * rendering hangs such that the next stalling command after the
- * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
+
+ /* Updating a surface state object may require that the state cache be
+ * invalidated. From the SKL PRM, Shared Functions -> State -> State
+ * Caching:
*
- * It is unclear exactly why this hang occurs. Both MI commands come with
- * warnings about the 3D pipeline but that doesn't seem to fully explain
- * it. My (Jason's) best theory is that it has something to do with the
- * fact that we're using a GPU state register as our temporary and that
- * something with reading/writing it is causing problems.
+ * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
+ * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
+ * modified [...], the L1 state cache must be invalidated to ensure
+ * the new surface or sampler state is fetched from system memory.
*
- * In order to work around this issue, we emit a PIPE_CONTROL with the
- * command streamer stall bit set.
+ * In testing, SKL doesn't actually seem to need this, but HSW does.
*/
anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "after copy_fast_clear_dwords. Avoid potential hang");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
+ "after load_image_clear_color surface state update");
#endif
+}
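/* Editor's note: worked breakdown, not part of this patch. The mi_memcpy
 * above copies the clear-value dwords into the surface state object:
 *
 *    dst = internal_surface_state_pool + surface_state.offset +
 *          isl_dev->ss.clear_value_offset
 *    src = the image's fast-clear state buffer (clear color slot)
 *    len = isl_dev->ss.clear_value_size
 *
 * On GFX10+ the surface state references the clear color indirectly, so
 * the whole function compiles away (GFX_VER < 10 guard above).
 */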
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
-
- if (copy_from_surface_state) {
- mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
- } else {
- mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
-
- /* Updating a surface state object may require that the state cache be
- * invalidated. From the SKL PRM, Shared Functions -> State -> State
- * Caching:
- *
- * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
- * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
- * modified [...], the L1 state cache must be invalidated to ensure
- * the new surface or sampler state is fetched from system memory.
- *
- * In testing, SKL doesn't actually seem to need this, but HSW does.
+void
+genX(set_fast_clear_state)(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ const enum isl_format format,
+ union isl_color_value clear_color)
+{
+ if (isl_color_value_is_zero(clear_color, format)) {
+ /* This image has the auxiliary buffer enabled. We can mark the
+ * subresource as not needing a resolve because the clear color
+ * will match what's in every RENDER_SURFACE_STATE object when
+ * it's being used for sampling.
*/
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
- "after copy_fast_clear_dwords surface state update");
+ set_image_fast_clear_state(cmd_buffer, image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ ANV_FAST_CLEAR_DEFAULT_VALUE);
+ } else {
+ set_image_fast_clear_state(cmd_buffer, image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ ANV_FAST_CLEAR_ANY);
}
}
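/* Editor's note: illustrative sketch, not part of this patch. It spells
 * out the mapping used above between the requested clear color and the
 * recorded fast-clear state.
 */
static inline enum anv_fast_clear_type
example_fast_clear_state_for_color(bool color_is_zero)
{
   return color_is_zero ? ANV_FAST_CLEAR_DEFAULT_VALUE
                        : ANV_FAST_CLEAR_ANY;
}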
@@ -1161,12 +991,12 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
uint32_t base_layer, uint32_t layer_count,
VkImageLayout initial_layout,
VkImageLayout final_layout,
- uint64_t src_queue_family,
- uint64_t dst_queue_family,
+ uint32_t src_queue_family,
+ uint32_t dst_queue_family,
bool will_full_fast_clear)
{
struct anv_device *device = cmd_buffer->device;
- const struct intel_device_info *devinfo = &device->info;
+ const struct intel_device_info *devinfo = device->info;
/* Validate the inputs. */
assert(cmd_buffer);
assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
@@ -1175,13 +1005,16 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
layer_count != VK_REMAINING_ARRAY_LAYERS);
/* Ensure the subresource range is valid. */
UNUSED uint64_t last_level_num = base_level + level_count;
- const uint32_t max_depth = anv_minify(image->vk.extent.depth, base_level);
+ const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
assert((uint64_t)base_layer + layer_count <= image_layers);
assert(last_level_num <= image->vk.mip_levels);
- /* The spec disallows these final layouts. */
- assert(final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
- final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED);
+ /* If there is a layout transfer, the final layout cannot be undefined or
+ * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
+ */
+ assert(initial_layout == final_layout ||
+ (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
+ final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
const struct isl_drm_modifier_info *isl_mod_info =
image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
? isl_drm_modifier_get_info(image->vk.drm_format_mod)
@@ -1195,6 +1028,18 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
+ /* If the queues are external, use the flags of the first queue family
+ * (it should be the most capable one).
+ */
+ const VkQueueFlagBits src_queue_flags =
+ device->physical->queue.families[
+ (src_queue_external || src_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
+ 0 : src_queue_family].queueFlags;
+ const VkQueueFlagBits dst_queue_flags =
+ device->physical->queue.families[
+ (dst_queue_external || dst_queue_family == VK_QUEUE_FAMILY_IGNORED) ?
+ 0 : dst_queue_family].queueFlags;
+
/* Simultaneous acquire and release on external queues is illegal. */
assert(!src_queue_external || !dst_queue_external);
@@ -1202,43 +1047,81 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
* image has a DRM format modifier because we store image data in
* a driver-private bo which is inaccessible to the external queue.
*/
- const bool mod_acquire =
+ const bool private_binding_acquire =
src_queue_external &&
- image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ anv_image_is_externally_shared(image) &&
+ anv_image_has_private_binding(image);
- const bool mod_release =
+ const bool private_binding_release =
dst_queue_external &&
- image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT;
+ anv_image_is_externally_shared(image) &&
+ anv_image_has_private_binding(image);
if (initial_layout == final_layout &&
- !mod_acquire && !mod_release) {
+ !private_binding_acquire && !private_binding_release) {
/* No work is needed. */
return;
}
- const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
-
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
- /* This surface is a linear compressed image with a tiled shadow surface
- * for texturing. The client is about to use it in READ_ONLY_OPTIMAL so
- * we need to ensure the shadow copy is up-to-date.
- */
- assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
- assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
- assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
- assert(plane == 0);
- anv_image_copy_to_shadow(cmd_buffer, image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- base_level, level_count,
- base_layer, layer_count);
+ /**
+ * Section 7.7.4 of the Vulkan 1.3.260 spec says:
+ *
+ * If the transfer is via an image memory barrier, and an image layout
+ * transition is desired, then the values of oldLayout and newLayout in the
+ * release operation's memory barrier must be equal to values of oldLayout
+ * and newLayout in the acquire operation's memory barrier. Although the
+ * image layout transition is submitted twice, it will only be executed
+ * once. A layout transition specified in this way happens-after the
+ * release operation and happens-before the acquire operation.
+ *
+ * Because we know we get matching transitions on each queue, we choose to
+ * only do the work on one queue type: RENDER. In the cases where we do
+ * transitions between COMPUTE & TRANSFER, we should have matching
+ * aux/fast_clear values, which would trigger no work in the code below.
+ */
+ if (!(src_queue_external || dst_queue_external) &&
+ src_queue_family != VK_QUEUE_FAMILY_IGNORED &&
+ dst_queue_family != VK_QUEUE_FAMILY_IGNORED &&
+ src_queue_family != dst_queue_family) {
+ enum intel_engine_class src_engine =
+ cmd_buffer->queue_family->engine_class;
+ if (src_engine != INTEL_ENGINE_CLASS_RENDER)
+ return;
}
+ const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
+
if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
return;
+ enum isl_aux_usage initial_aux_usage =
+ anv_layout_to_aux_usage(devinfo, image, aspect, 0,
+ initial_layout, src_queue_flags);
+ enum isl_aux_usage final_aux_usage =
+ anv_layout_to_aux_usage(devinfo, image, aspect, 0,
+ final_layout, dst_queue_flags);
+ enum anv_fast_clear_type initial_fast_clear =
+ anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout,
+ src_queue_flags);
+ enum anv_fast_clear_type final_fast_clear =
+ anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout,
+ dst_queue_flags);
+
+ /* We must override the anv_layout_to_* functions because they are unaware
+ * of acquire/release direction.
+ */
+ if (private_binding_acquire) {
+ initial_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
+ image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
+ initial_fast_clear = isl_mod_info->supports_clear_color ?
+ initial_fast_clear : ANV_FAST_CLEAR_NONE;
+ } else if (private_binding_release) {
+ final_aux_usage = isl_drm_modifier_has_aux(isl_mod_info->modifier) ?
+ image->planes[plane].aux_usage : ISL_AUX_USAGE_NONE;
+ final_fast_clear = isl_mod_info->supports_clear_color ?
+ final_fast_clear : ANV_FAST_CLEAR_NONE;
+ }
+
assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
/* The following layouts are equivalent for non-linear images. */
@@ -1254,8 +1137,43 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
* data.
*/
must_init_fast_clear_state = true;
- must_init_aux_surface = true;
- } else if (mod_acquire) {
+
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_MCS ||
+ devinfo->has_illegal_ccs_values) {
+
+ must_init_aux_surface = true;
+
+ } else {
+ assert(isl_aux_usage_has_ccs_e(image->planes[plane].aux_usage));
+
+ /* We can start using the CCS immediately without ambiguating. The
+ * two conditions that enable this are:
+ *
+ * 1) The device treats all possible CCS values as legal. In other
+ * words, we can't confuse the hardware with random bits in the
+ * CCS.
+ *
+ * 2) We enable compression on all writable image layouts. The CCS
+ * will receive all writes and will therefore always be in sync
+ * with the main surface.
+ *
+ * If we were to disable compression on some writable layouts, the
+ * CCS could get out of sync with the main surface and the app
+ * could lose the data it wrote previously. For example, this
+ * could happen if an app: transitions from UNDEFINED w/o
+ * ambiguating -> renders with AUX_NONE -> samples with AUX_CCS.
+ *
+ * The second condition is asserted below, but could be moved
+ * elsewhere for more coverage (we're only checking transitions from
+ * an undefined layout).
+ */
+ assert(vk_image_layout_is_read_only(final_layout, aspect) ||
+ (final_aux_usage != ISL_AUX_USAGE_NONE));
+
+ must_init_aux_surface = false;
+ }
+
+ } else if (private_binding_acquire) {
/* The fast clear state lives in a driver-private bo, and therefore the
* external/foreign queue is unaware of it.
*
@@ -1272,18 +1190,14 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
ANV_IMAGE_MEMORY_BINDING_PRIVATE);
must_init_fast_clear_state = true;
- if (image->planes[plane].aux_surface.memory_range.binding ==
+ if (anv_image_get_aux_memory_range(image, plane)->binding ==
ANV_IMAGE_MEMORY_BINDING_PRIVATE) {
- assert(isl_mod_info->aux_usage == ISL_AUX_USAGE_NONE);
-
/* The aux surface, like the fast clear state, lives in
* a driver-private bo. We must initialize the aux surface for the
* same reasons we must initialize the fast clear state.
*/
must_init_aux_surface = true;
} else {
- assert(isl_mod_info->aux_usage != ISL_AUX_USAGE_NONE);
-
/* The aux surface, unlike the fast clear state, lives in
* application-visible VkDeviceMemory and is shared with the
* external/foreign queue. Therefore, when we acquire ownership of the
@@ -1294,24 +1208,12 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
}
}
-#if GFX_VER == 12
- /* We do not yet support modifiers with aux on gen12. */
- assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
-
- if (initial_layout_undefined) {
- if (device->physical->has_implicit_ccs && devinfo->has_aux_map) {
- anv_image_init_aux_tt(cmd_buffer, image, aspect,
- base_level, level_count,
- base_layer, layer_count);
- }
- }
-#else
- assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
-#endif
-
if (must_init_fast_clear_state) {
- if (base_level == 0 && base_layer == 0)
- init_fast_clear_color(cmd_buffer, image, aspect);
+ if (base_level == 0 && base_layer == 0) {
+ set_image_fast_clear_state(cmd_buffer, image, aspect,
+ ANV_FAST_CLEAR_NONE);
+ }
+ init_fast_clear_color(cmd_buffer, image, aspect);
}
if (must_init_aux_surface) {
@@ -1341,14 +1243,15 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
*
* For MCS, (2) is never an issue because we don't support multisampled
* storage images. In theory, issue (1) is a problem with MCS but we've
- * never seen it in the wild. For 4x and 16x, all bit patters could, in
- * theory, be interpreted as something but we don't know that all bit
+ * never seen it in the wild. For 4x and 16x, all bit patterns could,
+ * in theory, be interpreted as something but we don't know that all bit
* patterns are actually valid. For 2x and 8x, you could easily end up
* with the MCS referring to an invalid plane because not all bits of
* the MCS value are actually used. Even though we've never seen issues
* in the wild, it's best to play it safe and initialize the MCS. We
- * can use a fast-clear for MCS because we only ever touch from render
- * and texture (no image load store).
+ * could use a fast-clear for MCS because we only ever touch it from render
+ * and texture (no image load store). However, due to WA 14013111325,
+ * we choose to ambiguate MCS as well.
*/
if (image->vk.samples == 1) {
for (uint32_t l = 0; l < level_count; l++) {
@@ -1377,19 +1280,10 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
aspect, level, base_layer, level_layer_count,
ISL_AUX_OP_AMBIGUATE, NULL, false);
- if (image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_E) {
- set_image_compressed_bit(cmd_buffer, image, aspect,
- level, base_layer, level_layer_count,
- false);
- }
+ set_image_compressed_bit(cmd_buffer, image, aspect, level,
+ base_layer, level_layer_count, false);
}
} else {
- if (image->vk.samples == 4 || image->vk.samples == 16) {
- anv_perf_warn(cmd_buffer->device, &image->vk.base,
- "Doing a potentially unnecessary fast-clear to "
- "define an MCS buffer.");
- }
-
/* If will_full_fast_clear is set, the caller promises to fast-clear
* the largest portion of the specified range as it can.
*/
@@ -1401,25 +1295,11 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
image->planes[plane].primary_surface.isl.format,
ISL_SWIZZLE_IDENTITY,
aspect, base_layer, layer_count,
- ISL_AUX_OP_FAST_CLEAR, NULL, false);
+ ISL_AUX_OP_AMBIGUATE, NULL, false);
}
return;
}
- enum isl_aux_usage initial_aux_usage =
- anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
- enum isl_aux_usage final_aux_usage =
- anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
-
- /* We must override the anv_layout_to_* functions because they are unaware of
- * acquire/release direction.
- */
- if (mod_acquire) {
- initial_aux_usage = isl_mod_info->aux_usage;
- } else if (mod_release) {
- final_aux_usage = isl_mod_info->aux_usage;
- }
-
/* The current code assumes that there is no mixing of CCS_E and CCS_D.
* We can handle transitions between CCS_D/E to and from NONE. What we
* don't yet handle is switching between CCS_E and CCS_D within a given
@@ -1440,15 +1320,29 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
/* If the initial layout supports more fast clear than the final layout
* then we need at least a partial resolve.
*/
- const enum anv_fast_clear_type initial_fast_clear =
- anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
- const enum anv_fast_clear_type final_fast_clear =
- anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
- if (final_fast_clear < initial_fast_clear)
+ if (final_fast_clear < initial_fast_clear) {
+ /* Partial resolves will actually only occur on layer 0/level 0. This
+ * is generally okay because anv only allows explicit fast clears to
+ * the first subresource.
+ *
+ * The situation is a bit different with FCV_CCS_E. With that aux
+ * usage, implicit fast clears can occur on any layer and level.
+ * anv doesn't track fast clear states for more than the first
+ * subresource, so we need to assert that a layout transition doesn't
+ * attempt to partial resolve the other subresources.
+ *
+ * At the moment, we don't enter such a situation, and partial resolves
+ * for higher level/layer resources shouldn't be a concern.
+ */
+ if (image->planes[plane].aux_usage == ISL_AUX_USAGE_FCV_CCS_E) {
+ assert(base_level == 0 && level_count == 1 &&
+ base_layer == 0 && layer_count == 1);
+ }
resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
+ }
- if (initial_aux_usage == ISL_AUX_USAGE_CCS_E &&
- final_aux_usage != ISL_AUX_USAGE_CCS_E)
+ if (isl_aux_usage_has_ccs_e(initial_aux_usage) &&
+ !isl_aux_usage_has_ccs_e(final_aux_usage))
resolve_op = ISL_AUX_OP_FULL_RESOLVE;
if (resolve_op == ISL_AUX_OP_NONE)
@@ -1474,7 +1368,7 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "after transition RT");
+ "before transition RT");
for (uint32_t l = 0; l < level_count; l++) {
uint32_t level = base_level + l;
@@ -1525,519 +1419,55 @@ transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
"after transition RT");
}
-static VkResult
-genX(cmd_buffer_setup_attachments)(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_render_pass *pass,
- const struct anv_framebuffer *framebuffer,
- const VkRenderPassBeginInfo *begin)
-{
- struct anv_cmd_state *state = &cmd_buffer->state;
-
- vk_free(&cmd_buffer->pool->alloc, state->attachments);
-
- if (pass->attachment_count > 0) {
- state->attachments = vk_zalloc(&cmd_buffer->pool->alloc,
- pass->attachment_count *
- sizeof(state->attachments[0]),
- 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (state->attachments == NULL) {
- /* Propagate VK_ERROR_OUT_OF_HOST_MEMORY to vkEndCommandBuffer */
- return anv_batch_set_error(&cmd_buffer->batch,
- VK_ERROR_OUT_OF_HOST_MEMORY);
- }
- } else {
- state->attachments = NULL;
- }
-
- const VkRenderPassAttachmentBeginInfoKHR *attach_begin =
- vk_find_struct_const(begin, RENDER_PASS_ATTACHMENT_BEGIN_INFO_KHR);
- if (begin && !attach_begin)
- assert(pass->attachment_count == framebuffer->attachment_count);
-
- for (uint32_t i = 0; i < pass->attachment_count; ++i) {
- if (attach_begin && attach_begin->attachmentCount != 0) {
- assert(attach_begin->attachmentCount == pass->attachment_count);
- ANV_FROM_HANDLE(anv_image_view, iview, attach_begin->pAttachments[i]);
- state->attachments[i].image_view = iview;
- } else if (framebuffer && i < framebuffer->attachment_count) {
- state->attachments[i].image_view = framebuffer->attachments[i];
- } else {
- state->attachments[i].image_view = NULL;
- }
- }
-
- if (begin) {
- for (uint32_t i = 0; i < pass->attachment_count; ++i) {
- const struct anv_render_pass_attachment *pass_att = &pass->attachments[i];
- struct anv_attachment_state *att_state = &state->attachments[i];
- VkImageAspectFlags att_aspects = vk_format_aspects(pass_att->format);
- VkImageAspectFlags clear_aspects = 0;
- VkImageAspectFlags load_aspects = 0;
-
- if (att_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- /* color attachment */
- if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
- clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
- } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
- load_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
- }
- } else {
- /* depthstencil attachment */
- if (att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
- clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
- } else if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
- load_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
- }
- }
- if (att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
- clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
- } else if (pass_att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_LOAD) {
- load_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
- }
- }
- }
-
- att_state->current_layout = pass_att->initial_layout;
- att_state->current_stencil_layout = pass_att->stencil_initial_layout;
- att_state->pending_clear_aspects = clear_aspects;
- att_state->pending_load_aspects = load_aspects;
- if (clear_aspects)
- att_state->clear_value = begin->pClearValues[i];
-
- struct anv_image_view *iview = state->attachments[i].image_view;
- anv_assert(iview->vk.format == pass_att->format);
-
- const uint32_t num_layers = iview->planes[0].isl.array_len;
- att_state->pending_clear_views = (1 << num_layers) - 1;
-
- /* This will be initialized after the first subpass transition. */
- att_state->aux_usage = ISL_AUX_USAGE_NONE;
-
- att_state->fast_clear = false;
- if (clear_aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- assert(clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- att_state->fast_clear =
- anv_can_fast_clear_color_view(cmd_buffer->device, iview,
- pass_att->first_subpass_layout,
- vk_to_isl_color(att_state->clear_value.color),
- framebuffer->layers,
- begin->renderArea);
- } else if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT)) {
- att_state->fast_clear =
- anv_can_hiz_clear_ds_view(cmd_buffer->device, iview,
- pass_att->first_subpass_layout,
- clear_aspects,
- att_state->clear_value.depthStencil.depth,
- begin->renderArea);
- }
- }
- }
-
- return VK_SUCCESS;
-}
-
-/**
- * Setup anv_cmd_state::attachments for vkCmdBeginRenderPass.
- */
-static VkResult
-genX(cmd_buffer_alloc_att_surf_states)(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_render_pass *pass,
- const struct anv_subpass *subpass)
+static MUST_CHECK VkResult
+anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t color_att_count)
{
- const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
- struct anv_cmd_state *state = &cmd_buffer->state;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
/* Reserve one for the NULL state. */
- unsigned num_states = 1;
- for (uint32_t i = 0; i < subpass->attachment_count; i++) {
- uint32_t att = subpass->attachments[i].attachment;
- if (att == VK_ATTACHMENT_UNUSED)
- continue;
-
- assert(att < pass->attachment_count);
- if (!vk_format_is_color(pass->attachments[att].format))
- continue;
-
- const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
- assert(util_bitcount(att_usage) == 1);
-
- if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT ||
- att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
- num_states++;
- }
-
- const uint32_t ss_stride = align_u32(isl_dev->ss.size, isl_dev->ss.align);
- state->attachment_states =
- anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
- num_states * ss_stride, isl_dev->ss.align);
- if (state->attachment_states.map == NULL) {
- return anv_batch_set_error(&cmd_buffer->batch,
- VK_ERROR_OUT_OF_DEVICE_MEMORY);
- }
+ unsigned num_states = 1 + color_att_count;
+ const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
+ const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
+ gfx->att_states =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, num_states);
+ if (gfx->att_states.map == NULL)
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
- struct anv_state next_state = state->attachment_states;
+ struct anv_state next_state = gfx->att_states;
next_state.alloc_size = isl_dev->ss.size;
- state->null_surface_state = next_state;
+ gfx->null_surface_state = next_state;
next_state.offset += ss_stride;
next_state.map += ss_stride;
- for (uint32_t i = 0; i < subpass->attachment_count; i++) {
- uint32_t att = subpass->attachments[i].attachment;
- if (att == VK_ATTACHMENT_UNUSED)
- continue;
-
- assert(att < pass->attachment_count);
- if (!vk_format_is_color(pass->attachments[att].format))
- continue;
-
- const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
- assert(util_bitcount(att_usage) == 1);
-
- if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT)
- state->attachments[att].color.state = next_state;
- else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT)
- state->attachments[att].input.state = next_state;
- else
- continue;
-
- state->attachments[att].color.state = next_state;
+ gfx->color_att_count = color_att_count;
+ for (uint32_t i = 0; i < color_att_count; i++) {
+ gfx->color_att[i] = (struct anv_attachment) {
+ .surface_state.state = next_state,
+ };
next_state.offset += ss_stride;
next_state.map += ss_stride;
}
-
- assert(next_state.offset == state->attachment_states.offset +
- state->attachment_states.alloc_size);
+ gfx->depth_att = (struct anv_attachment) { };
+ gfx->stencil_att = (struct anv_attachment) { };
return VK_SUCCESS;
}
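/* Editor's note: illustrative sketch, not part of this patch. The surface
 * states above are carved out of a single allocation at a stride of
 * align(isl_dev->ss.size, isl_dev->ss.align); slot 0 holds the NULL
 * surface state and slot 1 + i holds color attachment i. For an assumed
 * 64-byte state at 64-byte alignment with three color attachments, the
 * offsets are 0, 64, 128 and 192.
 */
static inline uint32_t
example_att_state_offset(uint32_t base_offset, uint32_t ss_stride,
                         uint32_t color_att_index)
{
   return base_offset + (1 + color_att_index) * ss_stride;
}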
-VkResult
-genX(BeginCommandBuffer)(
- VkCommandBuffer commandBuffer,
- const VkCommandBufferBeginInfo* pBeginInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- /* If this is the first vkBeginCommandBuffer, we must *initialize* the
- * command buffer's state. Otherwise, we must *reset* its state. In both
- * cases we reset it.
- *
- * From the Vulkan 1.0 spec:
- *
- * If a command buffer is in the executable state and the command buffer
- * was allocated from a command pool with the
- * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
- * vkBeginCommandBuffer implicitly resets the command buffer, behaving
- * as if vkResetCommandBuffer had been called with
- * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
- * the command buffer in the recording state.
- */
- anv_cmd_buffer_reset(cmd_buffer);
-
- cmd_buffer->usage_flags = pBeginInfo->flags;
-
- /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
- * primary level command buffers.
- *
- * From the Vulkan 1.0 spec:
- *
- * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
- * secondary command buffer is considered to be entirely inside a render
- * pass. If this is a primary command buffer, then this bit is ignored.
- */
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
- cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
-
- genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
-
- /* We sometimes store vertex data in the dynamic state buffer for blorp
- * operations and our dynamic state stream may re-use data from previous
- * command buffers. In order to prevent stale cache data, we flush the VF
- * cache. We could do this on every blorp call but that's not really
- * needed as all of the data will get written by the CPU prior to the GPU
- * executing anything. The chances are fairly high that they will use
- * blorp at least once per primary command buffer so it shouldn't be
- * wasted.
- *
- * There is also a workaround on gfx8 which requires us to invalidate the
- * VF cache occasionally. It's easier if we can assume we start with a
- * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
- "new cmd buffer");
-
- /* Re-emit the aux table register in every command buffer. This way we're
- * ensured that we have the table even if this command buffer doesn't
- * initialize any images.
- */
- if (cmd_buffer->device->info.has_aux_map) {
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
- "new cmd buffer with aux-tt");
- }
-
- /* We send an "Indirect State Pointers Disable" packet at
- * EndCommandBuffer, so all push contant packets are ignored during a
- * context restore. Documentation says after that command, we need to
- * emit push constants again before any rendering operation. So we
- * flag them dirty here to make sure they get emitted.
- */
- cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
-
- VkResult result = VK_SUCCESS;
- if (cmd_buffer->usage_flags &
- VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
- assert(pBeginInfo->pInheritanceInfo);
- ANV_FROM_HANDLE(anv_render_pass, pass,
- pBeginInfo->pInheritanceInfo->renderPass);
- struct anv_subpass *subpass =
- &pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
- ANV_FROM_HANDLE(anv_framebuffer, framebuffer,
- pBeginInfo->pInheritanceInfo->framebuffer);
-
- cmd_buffer->state.pass = pass;
- cmd_buffer->state.subpass = subpass;
-
- /* This is optional in the inheritance info. */
- cmd_buffer->state.framebuffer = framebuffer;
-
- result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
- framebuffer, NULL);
- if (result != VK_SUCCESS)
- return result;
-
- result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer, pass,
- subpass);
- if (result != VK_SUCCESS)
- return result;
-
- /* Record that HiZ is enabled if we can. */
- if (cmd_buffer->state.framebuffer) {
- const struct anv_image_view * const iview =
- anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
-
- if (iview) {
- VkImageLayout layout =
- cmd_buffer->state.subpass->depth_stencil_attachment->layout;
-
- enum isl_aux_usage aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
- layout);
-
- cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(aux_usage);
- }
- }
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
- }
-
-#if GFX_VERx10 >= 75
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
- const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
- vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
-
- /* If secondary buffer supports conditional rendering
- * we should emit commands as if conditional rendering is enabled.
- */
- cmd_buffer->state.conditional_render_enabled =
- conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
- }
-#endif
-
- return result;
-}
-
-/* From the PRM, Volume 2a:
- *
- * "Indirect State Pointers Disable
- *
- * At the completion of the post-sync operation associated with this pipe
- * control packet, the indirect state pointers in the hardware are
- * considered invalid; the indirect pointers are not saved in the context.
- * If any new indirect state commands are executed in the command stream
- * while the pipe control is pending, the new indirect state commands are
- * preserved.
- *
- * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
- * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
- * commands are only considered as Indirect State Pointers. Once ISP is
- * issued in a context, SW must initialize by programming push constant
- * commands for all the shaders (at least to zero length) before attempting
- * any rendering operation for the same context."
- *
- * 3DSTATE_CONSTANT_* packets are restored during a context restore,
- * even though they point to a BO that has been already unreferenced at
- * the end of the previous batch buffer. This has been fine so far since
- * we are protected by these scratch page (every address not covered by
- * a BO should be pointing to the scratch page). But on CNL, it is
- * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
- * instruction.
- *
- * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
- * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
- * context restore, so the mentioned hang doesn't happen. However,
- * software must program push constant commands for all stages prior to
- * rendering anything. So we flag them dirty in BeginCommandBuffer.
- *
- * Finally, we also make sure to stall at pixel scoreboard to make sure the
- * constants have been loaded into the EUs prior to disable the push constants
- * so that it doesn't hang a previous 3DPRIMITIVE.
- */
static void
-emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
-{
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.StallAtPixelScoreboard = true;
- pc.CommandStreamerStallEnable = true;
- anv_debug_dump_pc(pc);
- }
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.IndirectStatePointersDisable = true;
- pc.CommandStreamerStallEnable = true;
- anv_debug_dump_pc(pc);
- }
-}
-
-VkResult
-genX(EndCommandBuffer)(
- VkCommandBuffer commandBuffer)
+anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
- if (anv_batch_has_error(&cmd_buffer->batch))
- return cmd_buffer->batch.status;
+ gfx->render_area = (VkRect2D) { };
+ gfx->layer_count = 0;
+ gfx->samples = 0;
- anv_measure_endcommandbuffer(cmd_buffer);
-
- /* We want every command buffer to start with the PMA fix in a known state,
- * so we disable it at the end of the command buffer.
- */
- genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
-
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- emit_isp_disable(cmd_buffer);
-
- anv_cmd_buffer_end_batch_buffer(cmd_buffer);
-
- return VK_SUCCESS;
-}
-
-void
-genX(CmdExecuteCommands)(
- VkCommandBuffer commandBuffer,
- uint32_t commandBufferCount,
- const VkCommandBuffer* pCmdBuffers)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
-
- assert(primary->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-
- if (anv_batch_has_error(&primary->batch))
- return;
-
- /* The secondary command buffers will assume that the PMA fix is disabled
- * when they begin executing. Make sure this is true.
- */
- genX(cmd_buffer_enable_pma_fix)(primary, false);
-
- /* The secondary command buffer doesn't know which textures etc. have been
- * flushed prior to their execution. Apply those flushes now.
- */
- genX(cmd_buffer_apply_pipe_flushes)(primary);
-
- for (uint32_t i = 0; i < commandBufferCount; i++) {
- ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
-
- assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
- assert(!anv_batch_has_error(&secondary->batch));
-
-#if GFX_VERx10 >= 75
- if (secondary->state.conditional_render_enabled) {
- if (!primary->state.conditional_render_enabled) {
- /* Secondary buffer is constructed as if it will be executed
- * with conditional rendering, we should satisfy this dependency
- * regardless of conditional rendering being enabled in primary.
- */
- struct mi_builder b;
- mi_builder_init(&b, &primary->device->info, &primary->batch);
- mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
- mi_imm(UINT64_MAX));
- }
- }
-#endif
-
- if (secondary->usage_flags &
- VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
- /* If we're continuing a render pass from the primary, we need to
- * copy the surface states for the current subpass into the storage
- * we allocated for them in BeginCommandBuffer.
- */
- struct anv_bo *ss_bo =
- primary->device->surface_state_pool.block_pool.bo;
- struct anv_state src_state = primary->state.attachment_states;
- struct anv_state dst_state = secondary->state.attachment_states;
- assert(src_state.alloc_size == dst_state.alloc_size);
-
- genX(cmd_buffer_so_memcpy)(primary,
- (struct anv_address) {
- .bo = ss_bo,
- .offset = dst_state.offset,
- },
- (struct anv_address) {
- .bo = ss_bo,
- .offset = src_state.offset,
- },
- src_state.alloc_size);
- }
-
- anv_cmd_buffer_add_secondary(primary, secondary);
-
- assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
- secondary->perf_query_pool == primary->perf_query_pool);
- if (secondary->perf_query_pool)
- primary->perf_query_pool = secondary->perf_query_pool;
-
-#if GFX_VERx10 == 120
- if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
- primary->state.depth_reg_mode = secondary->state.depth_reg_mode;
-#endif
- }
-
- /* The secondary isn't counted in our VF cache tracking so we need to
- * invalidate the whole thing.
- */
- if (GFX_VER >= 8 && GFX_VER <= 9) {
- anv_add_pending_pipe_bits(primary,
- ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
- "Secondary cmd buffer not tracked in VF cache");
- }
-
- /* The secondary may have selected a different pipeline (3D or compute) and
- * may have changed the current L3$ configuration. Reset our tracking
- * variables to invalid values to ensure that we re-emit these in the case
- * where we do any draws or compute dispatches from the primary after the
- * secondary has returned.
- */
- primary->state.current_pipeline = UINT32_MAX;
- primary->state.current_l3_config = NULL;
- primary->state.current_hash_scale = 0;
-
- /* Each of the secondary command buffers will use its own state base
- * address. We need to re-emit state base address for the primary after
- * all of the secondaries are done.
- *
- * TODO: Maybe we want to make this a dirty bit to avoid extra state base
- * address calls?
- */
- genX(cmd_buffer_emit_state_base_address)(primary);
+ gfx->color_att_count = 0;
+ gfx->depth_att = (struct anv_attachment) { };
+ gfx->stencil_att = (struct anv_attachment) { };
+ gfx->null_surface_state = ANV_STATE_NULL;
}
/**
@@ -2057,7 +1487,7 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
*/
assert(cfg == cmd_buffer->device->l3_config);
#else
- if (INTEL_DEBUG & DEBUG_L3) {
+ if (INTEL_DEBUG(DEBUG_L3)) {
mesa_logd("L3 config transition: ");
intel_dump_l3_config(cfg, stderr);
}
@@ -2066,12 +1496,10 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
* while the pipeline is completely drained and the caches are flushed,
* which involves a first PIPE_CONTROL flush which stalls the pipeline...
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DCFlushEnable = true;
- pc.PostSyncOperation = NoWrite;
- pc.CommandStreamerStallEnable = true;
- anv_debug_dump_pc(pc);
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT);
/* ...followed by a second pipelined PIPE_CONTROL that initiates
* invalidation of the relevant caches. Note that because RO invalidation
@@ -2087,40 +1515,128 @@ genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
* already guarantee that there is no concurrent GPGPU kernel execution
* (see SKL HSD 2132585).
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.TextureCacheInvalidationEnable = true;
- pc.ConstantCacheInvalidationEnable = true;
- pc.InstructionCacheInvalidateEnable = true;
- pc.StateCacheInvalidationEnable = true;
- pc.PostSyncOperation = NoWrite;
- anv_debug_dump_pc(pc);
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT);
/* Now send a third stalling flush to make sure that invalidation is
* complete when the L3 configuration registers are modified.
*/
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DCFlushEnable = true;
- pc.PostSyncOperation = NoWrite;
- pc.CommandStreamerStallEnable = true;
- anv_debug_dump_pc(pc);
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT);
genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
#endif /* GFX_VER >= 11 */
cmd_buffer->state.current_l3_config = cfg;
}
-void
-genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
+ALWAYS_INLINE void
+genX(invalidate_aux_map)(struct anv_batch *batch,
+ struct anv_device *device,
+ enum intel_engine_class engine_class,
+ enum anv_pipe_bits bits)
{
- UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
- enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
+#if GFX_VER == 12
+ if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) && device->info->has_aux_map) {
+ uint32_t register_addr = 0;
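+      /* Each engine class has its own CCS aux invalidation register. */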
+ switch (engine_class) {
+ case INTEL_ENGINE_CLASS_COMPUTE:
+ register_addr = GENX(COMPCS0_CCS_AUX_INV_num);
+ break;
+ case INTEL_ENGINE_CLASS_COPY:
+#if GFX_VERx10 >= 125
+ register_addr = GENX(BCS_CCS_AUX_INV_num);
+#endif
+ break;
+ case INTEL_ENGINE_CLASS_VIDEO:
+ register_addr = GENX(VD0_CCS_AUX_INV_num);
+ break;
+ case INTEL_ENGINE_CLASS_RENDER:
+ default:
+ register_addr = GENX(GFX_CCS_AUX_INV_num);
+ break;
+ }
- if (unlikely(cmd_buffer->device->physical->always_flush_cache))
- bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
- else if (bits == 0)
- return;
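+      /* Writing 1 to the selected register kicks off the invalidation. */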
+ anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = register_addr;
+ lri.DataDWord = 1;
+ }
+
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(device->info, 16018063123) &&
+ engine_class == INTEL_ENGINE_CLASS_COPY) {
+ genX(batch_emit_fast_color_dummy_blit)(batch, device);
+ }
+
+ /* HSD 22012751911: SW Programming sequence when issuing aux invalidation:
+ *
+ * "Poll Aux Invalidation bit once the invalidation is set
+ * (Register 4208 bit 0)"
+ */
+ anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.WaitMode = PollingMode;
+ sem.RegisterPollMode = true;
+ sem.SemaphoreDataDword = 0x0;
+ sem.SemaphoreAddress =
+ anv_address_from_u64(register_addr);
+ }
+ }
+#else
+ assert(!device->info->has_aux_map);
+#endif
+}
+
+ALWAYS_INLINE enum anv_pipe_bits
+genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
+ struct anv_device *device,
+ uint32_t current_pipeline,
+ enum anv_pipe_bits bits,
+ enum anv_pipe_bits *emitted_flush_bits)
+{
+#if GFX_VER >= 12
+ /* From the TGL PRM, Volume 2a, "PIPE_CONTROL":
+ *
+ * "SW must follow below programming restrictions when programming
+ * PIPE_CONTROL command [for ComputeCS]:
+ * ...
+ * Following bits must not be set when programmed for ComputeCS:
+ * - "Render Target Cache Flush Enable", "Depth Cache Flush Enable"
+ * and "Tile Cache Flush Enable"
+ * - "Depth Stall Enable", Stall at Pixel Scoreboard and
+ * "PSD Sync Enable".
+ * - "OVR Tile 0 Flush", "TBIMR Force Batch Closure",
+ * "AMFS Flush Enable", "VF Cache Invalidation Enable" and
+ * "Global Snapshot Count Reset"."
+ *
+ * XXX: According to spec this should not be a concern for a regular
+ * RCS in GPGPU mode, but during testing it was found that at least
+ * "VF Cache Invalidation Enable" bit is ignored in such case.
+ * This can cause us to miss some important invalidations
+ * (e.g. from CmdPipelineBarriers) and have incoherent data.
+ *
+    * There is also a Wa_1606932921 "RCS is not waking up fixed function clock
+    * when specific 3d related bits are programmed in pipecontrol in
+    * compute mode", which suggests we should not use "RT Cache Flush" in
+    * GPGPU mode.
+    *
+    * The other bits are not confirmed to cause problems, but they are
+    * included here just to be safe, as they're not really relevant in
+    * GPGPU mode and having them doesn't seem to cause any regressions.
+    *
+    * So if we're currently in GPGPU mode, we hide some bits from this
+    * flush and will only flush them once we're able to. The same
+    * applies to GPGPU-only bits.
+ */
+ enum anv_pipe_bits defer_bits = bits &
+      (current_pipeline == GPGPU ? ANV_PIPE_GFX_BITS : ANV_PIPE_GPGPU_BITS);
+
+ bits &= ~defer_bits;
+#endif
/*
* From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
@@ -2158,9 +1674,34 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
* "Driver must ensure that the engine is IDLE but ensure it doesn't
* add extra flushes in the case it knows that the engine is already
* IDLE."
+ *
+ * HSD 22012751911: SW Programming sequence when issuing aux invalidation:
+ *
+ * "Render target Cache Flush + L3 Fabric Flush + State Invalidation + CS Stall"
+ *
+ * Notice we don't set the L3 Fabric Flush here, because we have
+ * ANV_PIPE_END_OF_PIPE_SYNC_BIT which inserts a CS stall. The
+ * PIPE_CONTROL::L3 Fabric Flush documentation says :
+ *
+ * "L3 Fabric Flush will ensure all the pending transactions in the L3
+ * Fabric are flushed to global observation point. HW does implicit L3
+ * Fabric Flush on all stalling flushes (both explicit and implicit)
+ * and on PIPECONTROL having Post Sync Operation enabled."
+ *
+ * Therefore setting L3 Fabric Flush here would be redundant.
*/
- if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT))
- bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
+ if (GFX_VER == 12 && (bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT)) {
+ if (current_pipeline == GPGPU) {
+ bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+                  (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0));
+ } else if (current_pipeline == _3D) {
+ bits |= (ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT |
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+                  (GFX_VERx10 == 125 ? ANV_PIPE_CCS_CACHE_FLUSH_BIT : 0));
+ }
+ }
/* If we're going to do an invalidate and we have a pending end-of-pipe
* sync that has yet to be resolved, we do the end-of-pipe sync now.
@@ -2169,24 +1710,12 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
(bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
- }
- /* Wa_1409226450, Wait for EU to be idle before pipe control which
- * invalidates the instruction cache
- */
- if (GFX_VER == 12 && (bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT))
- bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
-
- if ((GFX_VER >= 8 && GFX_VER <= 9) &&
- (bits & ANV_PIPE_CS_STALL_BIT) &&
- (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
- /* If we are doing a VF cache invalidate AND a CS stall (it must be
- * both) then we can reset our vertex cache tracking.
- */
- memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
- sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
- memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
- sizeof(cmd_buffer->state.gfx.ib_dirty_range));
+ if (INTEL_DEBUG(DEBUG_PIPE_CONTROL) && bits) {
+ fputs("pc: add ", stderr);
+         anv_dump_pipe_bits(ANV_PIPE_END_OF_PIPE_SYNC_BIT, stderr);
+ fprintf(stderr, "reason: Ensure flushes done before invalidate\n");
+ }
}
/* Project: SKL / Argument: LRI Post Sync Operation [23]
@@ -2197,146 +1726,86 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
* PIPELINE_SELECT command is set to GPGPU mode of operation)."
*
* The same text exists a few rows below for Post Sync Op.
- *
- * On Gfx12 this is Wa_1607156449.
*/
if (bits & ANV_PIPE_POST_SYNC_BIT) {
- if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0 */)) &&
- cmd_buffer->state.current_pipeline == GPGPU)
+ if (GFX_VER == 9 && current_pipeline == GPGPU)
bits |= ANV_PIPE_CS_STALL_BIT;
bits &= ~ANV_PIPE_POST_SYNC_BIT;
}
if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
-#if GFX_VER >= 12
- pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
- pipe.HDCPipelineFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
-#else
- /* Flushing HDC pipeline requires DC Flush on earlier HW. */
- pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
-#endif
- pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
- pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
- pipe.RenderTargetCacheFlushEnable =
- bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ enum anv_pipe_bits flush_bits =
+ bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT);
- /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
- * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
- */
-#if GFX_VER >= 12
- pipe.DepthStallEnable =
- pipe.DepthCacheFlushEnable || (bits & ANV_PIPE_DEPTH_STALL_BIT);
-#else
- pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
+#if GFX_VERx10 >= 125
+ if (current_pipeline != GPGPU) {
+ if (flush_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
+ flush_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ } else {
+ if (flush_bits & (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT))
+ flush_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+
+ /* BSpec 47112: PIPE_CONTROL::Untyped Data-Port Cache Flush:
+ *
+ * "'HDC Pipeline Flush' bit must be set for this bit to take
+ * effect."
+ */
+ if (flush_bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)
+ flush_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
#endif
- pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
- pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+#if GFX_VER < 12
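+      /* Flushing the HDC pipeline requires a DC flush on earlier HW. */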
+ if (flush_bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT)
+ flush_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+#endif
- /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
- *
- * "The most common action to perform upon reaching a
- * synchronization point is to write a value out to memory. An
- * immediate value (included with the synchronization command) may
- * be written."
- *
- *
- * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
- *
- * "In case the data flushed out by the render engine is to be
- * read back in to the render engine in coherent manner, then the
- * render engine has to wait for the fence completion before
- * accessing the flushed data. This can be achieved by following
- * means on various products: PIPE_CONTROL command with CS Stall
- * and the required write caches flushed with Post-Sync-Operation
- * as Write Immediate Data.
- *
- * Example:
- * - Workload-1 (3D/GPGPU/MEDIA)
- * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
- * Immediate Data, Required Write Cache Flush bits set)
- * - Workload-2 (Can use the data produce or output by
- * Workload-1)
- */
- if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
- pipe.CommandStreamerStallEnable = true;
- pipe.PostSyncOperation = WriteImmediateData;
- pipe.Address = cmd_buffer->device->workaround_address;
- }
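+      /* Default to no post-sync operation; an end-of-pipe sync overrides this
+       * below with an immediate data write.
+       */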
+ uint32_t sync_op = NoWrite;
+ struct anv_address addr = ANV_NULL_ADDRESS;
- /*
- * According to the Broadwell documentation, any PIPE_CONTROL with the
- * "Command Streamer Stall" bit set must also have another bit set,
- * with five different options:
- *
- * - Render Target Cache Flush
- * - Depth Cache Flush
- * - Stall at Pixel Scoreboard
- * - Post-Sync Operation
- * - Depth Stall
- * - DC Flush Enable
- *
- * I chose "Stall at Pixel Scoreboard" since that's what we use in
- * mesa and it seems to work fine. The choice is fairly arbitrary.
- */
- if (pipe.CommandStreamerStallEnable &&
- !pipe.RenderTargetCacheFlushEnable &&
- !pipe.DepthCacheFlushEnable &&
- !pipe.StallAtPixelScoreboard &&
- !pipe.PostSyncOperation &&
- !pipe.DepthStallEnable &&
- !pipe.DCFlushEnable)
- pipe.StallAtPixelScoreboard = true;
- anv_debug_dump_pc(pipe);
+ /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
+ *
+ * "The most common action to perform upon reaching a
+ * synchronization point is to write a value out to memory. An
+ * immediate value (included with the synchronization command) may
+ * be written."
+ *
+ *
+ * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
+ *
+ * "In case the data flushed out by the render engine is to be
+ * read back in to the render engine in coherent manner, then the
+ * render engine has to wait for the fence completion before
+ * accessing the flushed data. This can be achieved by following
+ * means on various products: PIPE_CONTROL command with CS Stall
+ * and the required write caches flushed with Post-Sync-Operation
+ * as Write Immediate Data.
+ *
+ * Example:
+ * - Workload-1 (3D/GPGPU/MEDIA)
+ * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
+ * Immediate Data, Required Write Cache Flush bits set)
+ * - Workload-2 (Can use the data produce or output by
+ * Workload-1)
+ */
+ if (flush_bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
+ flush_bits |= ANV_PIPE_CS_STALL_BIT;
+ sync_op = WriteImmediateData;
+ addr = device->workaround_address;
}
- /* If a render target flush was emitted, then we can toggle off the bit
- * saying that render target writes are ongoing.
- */
- if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
- bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
+ /* Flush PC. */
+ genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
+ sync_op, addr, 0, flush_bits);
- if (GFX_VERx10 == 75) {
- /* Haswell needs addition work-arounds:
- *
- * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
- *
- * Option 1:
- * PIPE_CONTROL command with the CS Stall and the required write
- * caches flushed with Post-SyncOperation as Write Immediate Data
- * followed by eight dummy MI_STORE_DATA_IMM (write to scratch
- * spce) commands.
- *
- * Example:
- * - Workload-1
- * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
- * Immediate Data, Required Write Cache Flush bits set)
- * - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
- * - Workload-2 (Can use the data produce or output by
- * Workload-1)
- *
- * Unfortunately, both the PRMs and the internal docs are a bit
- * out-of-date in this regard. What the windows driver does (and
- * this appears to actually work) is to emit a register read from the
- * memory address written by the pipe control above.
- *
- * What register we load into doesn't matter. We choose an indirect
- * rendering register because we know it always exists and it's one
- * of the first registers the command parser allows us to write. If
- * you don't have command parser support in your kernel (pre-4.2),
- * this will get turned into MI_NOOP and you won't get the
- * workaround. Unfortunately, there's just not much we can do in
- * that case. This register is perfectly safe to write since we
- * always re-load all of the indirect draw registers right before
- * 3DPRIMITIVE when needed anyway.
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
- lrm.RegisterAddress = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
- lrm.MemoryAddress = cmd_buffer->device->workaround_address;
- }
- }
+ /* If the caller wants to know what flushes have been emitted,
+ * provide the bits based off the PIPE_CONTROL programmed bits.
+ */
+ if (emitted_flush_bits != NULL)
+ *emitted_flush_bits = flush_bits;
bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
ANV_PIPE_END_OF_PIPE_SYNC_BIT);
@@ -2354,202 +1823,328 @@ genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
* This appears to hang Broadwell, so we restrict it to just gfx9.
*/
if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT))
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
- pipe.StateCacheInvalidationEnable =
- bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
- pipe.ConstantCacheInvalidationEnable =
- bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
- pipe.VFCacheInvalidationEnable =
- bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
- pipe.TextureCacheInvalidationEnable =
- bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
- pipe.InstructionCacheInvalidateEnable =
- bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
-
- /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
- *
- * "When VF Cache Invalidate is set “Post Sync Operation” must be
- * enabled to “Write Immediate Data” or “Write PS Depth Count” or
- * “Write Timestamp”.
- */
- if (GFX_VER == 9 && pipe.VFCacheInvalidationEnable) {
- pipe.PostSyncOperation = WriteImmediateData;
- pipe.Address = cmd_buffer->device->workaround_address;
- }
- anv_debug_dump_pc(pipe);
- }
+ anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe);
-#if GFX_VER == 12
- if ((bits & ANV_PIPE_AUX_TABLE_INVALIDATE_BIT) &&
- cmd_buffer->device->info.has_aux_map) {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(GFX_CCS_AUX_INV_num);
- lri.DataDWord = 1;
- }
- }
+#if GFX_VER >= 9 && GFX_VER <= 11
+ /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
+ *
+ * "Workaround : “CS Stall” bit in PIPE_CONTROL command must be
+ * always set for GPGPU workloads when “Texture Cache
+ * Invalidation Enable” bit is set".
+ *
+ * Workaround stopped appearing in TGL PRMs.
+ */
+ if (current_pipeline == GPGPU &&
+ (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
+ bits |= ANV_PIPE_CS_STALL_BIT;
#endif
+ uint32_t sync_op = NoWrite;
+ struct anv_address addr = ANV_NULL_ADDRESS;
+
+ /* From the SKL PRM, Vol. 2a, "PIPE_CONTROL",
+ *
+ * "When VF Cache Invalidate is set “Post Sync Operation” must be
+ * enabled to “Write Immediate Data” or “Write PS Depth Count” or
+ * “Write Timestamp”.
+ */
+ if (GFX_VER == 9 && (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
+ sync_op = WriteImmediateData;
+ addr = device->workaround_address;
+ }
+
+ /* Invalidate PC. */
+ genx_batch_emit_pipe_control_write(batch, device->info, current_pipeline,
+ sync_op, addr, 0, bits);
+
+ enum intel_engine_class engine_class =
+ current_pipeline == GPGPU ? INTEL_ENGINE_CLASS_COMPUTE :
+ INTEL_ENGINE_CLASS_RENDER;
+ genX(invalidate_aux_map)(batch, device, engine_class, bits);
+
bits &= ~ANV_PIPE_INVALIDATE_BITS;
}
- cmd_buffer->state.pending_pipe_bits = bits;
+#if GFX_VER >= 12
+ bits |= defer_bits;
+#endif
+
+ return bits;
}
-void genX(CmdPipelineBarrier)(
- VkCommandBuffer commandBuffer,
- VkPipelineStageFlags srcStageMask,
- VkPipelineStageFlags destStageMask,
- VkBool32 byRegion,
- uint32_t memoryBarrierCount,
- const VkMemoryBarrier* pMemoryBarriers,
- uint32_t bufferMemoryBarrierCount,
- const VkBufferMemoryBarrier* pBufferMemoryBarriers,
- uint32_t imageMemoryBarrierCount,
- const VkImageMemoryBarrier* pImageMemoryBarriers)
+ALWAYS_INLINE void
+genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- /* XXX: Right now, we're really dumb and just flush whatever categories
- * the app asks for. One of these days we may make this a bit better
- * but right now that's all the hardware allows for in most areas.
+#if INTEL_NEEDS_WA_1508744258
+ /* If we're changing the state of the RHWO optimization, we need to have
+ * sb_stall+cs_stall.
*/
- VkAccessFlags src_flags = 0;
- VkAccessFlags dst_flags = 0;
-
- for (uint32_t i = 0; i < memoryBarrierCount; i++) {
- src_flags |= pMemoryBarriers[i].srcAccessMask;
- dst_flags |= pMemoryBarriers[i].dstAccessMask;
+ const bool rhwo_opt_change =
+ cmd_buffer->state.rhwo_optimization_enabled !=
+ cmd_buffer->state.pending_rhwo_optimization_enabled;
+ if (rhwo_opt_change) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "change RHWO optimization");
}
+#endif
- for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
- src_flags |= pBufferMemoryBarriers[i].srcAccessMask;
- dst_flags |= pBufferMemoryBarriers[i].dstAccessMask;
- }
+ enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
- for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
- src_flags |= pImageMemoryBarriers[i].srcAccessMask;
- dst_flags |= pImageMemoryBarriers[i].dstAccessMask;
- ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[i].image);
- const VkImageSubresourceRange *range =
- &pImageMemoryBarriers[i].subresourceRange;
+ if (unlikely(cmd_buffer->device->physical->always_flush_cache))
+ bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
+ else if (bits == 0)
+ return;
- uint32_t base_layer, layer_count;
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- base_layer = 0;
- layer_count = anv_minify(image->vk.extent.depth, range->baseMipLevel);
- } else {
- base_layer = range->baseArrayLayer;
- layer_count = vk_image_subresource_layer_count(&image->vk, range);
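+   /* PIPE_CONTROL is not supported on the blitter/video engines, so only the
+    * aux map invalidation is handled for those queues.
+    */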
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ if (bits & ANV_PIPE_INVALIDATE_BITS) {
+ genX(invalidate_aux_map)(&cmd_buffer->batch, cmd_buffer->device,
+ cmd_buffer->queue_family->engine_class, bits);
+ bits &= ~ANV_PIPE_INVALIDATE_BITS;
}
- const uint32_t level_count =
- vk_image_subresource_level_count(&image->vk, range);
+ cmd_buffer->state.pending_pipe_bits = bits;
+ return;
+ }
- if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
- transition_depth_buffer(cmd_buffer, image,
- base_layer, layer_count,
- pImageMemoryBarriers[i].oldLayout,
- pImageMemoryBarriers[i].newLayout,
- false /* will_full_fast_clear */);
- }
+ const bool trace_flush =
+ (bits & (ANV_PIPE_FLUSH_BITS |
+ ANV_PIPE_STALL_BITS |
+ ANV_PIPE_INVALIDATE_BITS |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT)) != 0;
+ if (trace_flush)
+ trace_intel_begin_stall(&cmd_buffer->trace);
- if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
- transition_stencil_buffer(cmd_buffer, image,
- range->baseMipLevel, level_count,
- base_layer, layer_count,
- pImageMemoryBarriers[i].oldLayout,
- pImageMemoryBarriers[i].newLayout,
- false /* will_full_fast_clear */);
- }
+ if (GFX_VER == 9 &&
+ (bits & ANV_PIPE_CS_STALL_BIT) &&
+ (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
+ /* If we are doing a VF cache invalidate AND a CS stall (it must be
+ * both) then we can reset our vertex cache tracking.
+ */
+ memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
+ sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
+ memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
+ sizeof(cmd_buffer->state.gfx.ib_dirty_range));
+ }
- if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- VkImageAspectFlags color_aspects =
- vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
- anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
- transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
- range->baseMipLevel, level_count,
- base_layer, layer_count,
- pImageMemoryBarriers[i].oldLayout,
- pImageMemoryBarriers[i].newLayout,
- pImageMemoryBarriers[i].srcQueueFamilyIndex,
- pImageMemoryBarriers[i].dstQueueFamilyIndex,
- false /* will_full_fast_clear */);
- }
+
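+   /* Keep track of the flushes that actually get emitted so the pending
+    * query bits can be updated accordingly.
+    */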
+ enum anv_pipe_bits emitted_bits = 0;
+ cmd_buffer->state.pending_pipe_bits =
+ genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.current_pipeline,
+ bits,
+ &emitted_bits);
+ anv_cmd_buffer_update_pending_query_bits(cmd_buffer, emitted_bits);
+
+#if INTEL_NEEDS_WA_1508744258
+ if (rhwo_opt_change) {
+ anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
+ c1.RCCRHWOOptimizationDisable =
+ !cmd_buffer->state.pending_rhwo_optimization_enabled;
+ c1.RCCRHWOOptimizationDisableMask = true;
}
+ cmd_buffer->state.rhwo_optimization_enabled =
+ cmd_buffer->state.pending_rhwo_optimization_enabled;
}
+#endif
- anv_add_pending_pipe_bits(cmd_buffer,
- anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
- anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags),
- "pipe barrier");
+ if (trace_flush) {
+ trace_intel_end_stall(&cmd_buffer->trace,
+ bits & ~cmd_buffer->state.pending_pipe_bits,
+ anv_pipe_flush_bit_to_ds_stall_flag,
+ cmd_buffer->state.pc_reasons[0],
+ cmd_buffer->state.pc_reasons[1],
+ cmd_buffer->state.pc_reasons[2],
+ cmd_buffer->state.pc_reasons[3]);
+ cmd_buffer->state.pc_reasons[0] = NULL;
+ cmd_buffer->state.pc_reasons[1] = NULL;
+ cmd_buffer->state.pc_reasons[2] = NULL;
+ cmd_buffer->state.pc_reasons[3] = NULL;
+ cmd_buffer->state.pc_reasons_count = 0;
+ }
}
-static void
-cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
+static inline struct anv_state
+emit_dynamic_buffer_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ struct anv_pipeline_binding *binding,
+ const struct anv_descriptor *desc)
{
- VkShaderStageFlags stages =
- cmd_buffer->state.gfx.pipeline->active_stages;
+ if (!desc->buffer)
+ return anv_null_surface_state_for_binding_table(cmd_buffer->device);
+
+ /* Compute the offset within the buffer */
+ uint32_t dynamic_offset =
+ pipe_state->dynamic_offsets[
+ binding->set].offsets[binding->dynamic_offset_index];
+ uint64_t offset = desc->offset + dynamic_offset;
+ /* Clamp to the buffer size */
+ offset = MIN2(offset, desc->buffer->vk.size);
+ /* Clamp the range to the buffer size */
+ uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
+
+ /* Align the range for consistency */
+ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
+ range = align(range, ANV_UBO_ALIGNMENT);
+
+ struct anv_address address =
+ anv_address_add(desc->buffer->address, offset);
+
+ struct anv_state surface_state =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (surface_state.map == NULL)
+ return ANV_STATE_NULL;
+
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(cmd_buffer->device,
+ desc->type);
+
+ isl_surf_usage_flags_t usage =
+ anv_isl_usage_for_descriptor_type(desc->type);
+
+ anv_fill_buffer_surface_state(cmd_buffer->device,
+ surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ usage, address, range, 1);
+
+ return surface_state;
+}
+
+static uint32_t
+emit_indirect_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ struct anv_pipeline_binding *binding,
+ const struct anv_descriptor *desc)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_state surface_state;
- /* In order to avoid thrash, we assume that vertex and fragment stages
- * always exist. In the rare case where one is missing *and* the other
- * uses push concstants, this may be suboptimal. However, avoiding stalls
- * seems more important.
+ /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
+    * Depending on where the descriptor surface state is allocated, the
+    * state can come from either device->internal_surface_state_pool or
+ * device->bindless_surface_state_pool.
*/
- stages |= VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT;
+ switch (desc->type) {
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
+ if (desc->image_view) {
+ const struct anv_surface_state *sstate =
+ anv_image_view_texture_surface_state(desc->image_view,
+ binding->plane,
+ desc->layout);
+ surface_state = desc->image_view->use_surface_state_stream ?
+ sstate->state :
+ anv_bindless_state_for_binding_table(device, sstate->state);
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state = anv_null_surface_state_for_binding_table(device);
+ }
+ break;
+ }
- if (stages == cmd_buffer->state.gfx.push_constant_stages)
- return;
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
+ if (desc->image_view) {
+ const struct anv_surface_state *sstate =
+ anv_image_view_storage_surface_state(desc->image_view);
+ surface_state = desc->image_view->use_surface_state_stream ?
+ sstate->state :
+ anv_bindless_state_for_binding_table(device, sstate->state);
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state =
+ anv_null_surface_state_for_binding_table(device);
+ }
+ break;
+ }
-#if GFX_VER >= 8
- const unsigned push_constant_kb = 32;
-#elif GFX_VERx10 == 75
- const unsigned push_constant_kb = cmd_buffer->device->info.gt == 3 ? 32 : 16;
-#else
- const unsigned push_constant_kb = 16;
-#endif
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ if (desc->set_buffer_view) {
+ surface_state = desc->set_buffer_view->general.state;
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state = anv_null_surface_state_for_binding_table(device);
+ }
+ break;
+
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ if (desc->buffer_view) {
+ surface_state = anv_bindless_state_for_binding_table(
+ device,
+ desc->buffer_view->general.state);
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state = anv_null_surface_state_for_binding_table(device);
+ }
+ break;
- const unsigned num_stages =
- util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
- unsigned size_per_stage = push_constant_kb / num_stages;
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
+ surface_state =
+ emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
+ binding, desc);
+ break;
- /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
- * units of 2KB. Incidentally, these are the same platforms that have
- * 32KB worth of push constant space.
- */
- if (push_constant_kb == 32)
- size_per_stage &= ~1u;
-
- uint32_t kb_used = 0;
- for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
- unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
- alloc._3DCommandSubOpcode = 18 + i;
- alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
- alloc.ConstantBufferSize = push_size;
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ if (desc->buffer_view) {
+ surface_state = anv_bindless_state_for_binding_table(
+ device, desc->buffer_view->storage.state);
+ assert(surface_state.alloc_size);
+ } else {
+ surface_state = anv_null_surface_state_for_binding_table(device);
}
- kb_used += push_size;
- }
+ break;
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
- alloc.ConstantBufferOffset = kb_used;
- alloc.ConstantBufferSize = push_constant_kb - kb_used;
+ default:
+ unreachable("Invalid descriptor type");
}
- cmd_buffer->state.gfx.push_constant_stages = stages;
+ return surface_state.offset;
+}
+
+static uint32_t
+emit_direct_descriptor_binding_table_entry(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ const struct anv_descriptor_set *set,
+ struct anv_pipeline_binding *binding,
+ const struct anv_descriptor *desc)
+{
+ uint32_t desc_offset;
- /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
- *
- * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
- * the next 3DPRIMITIVE command after programming the
- * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
- *
- * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
- * pipeline setup, we need to dirty push constants.
+ /* Relative offset in the STATE_BASE_ADDRESS::SurfaceStateBaseAddress heap.
+    * Depending on where the descriptor surface state is allocated, the
+    * state can come from either device->internal_surface_state_pool or
+ * device->bindless_surface_state_pool.
*/
- cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
+ switch (desc->type) {
+ case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
+ case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
+ case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+ case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
+ desc_offset = set->desc_offset + binding->set_offset;
+ break;
+
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
+ struct anv_state state =
+ emit_dynamic_buffer_binding_table_entry(cmd_buffer, pipe_state,
+ binding, desc);
+ desc_offset = state.offset;
+ break;
+ }
+
+ default:
+ unreachable("Invalid descriptor type");
+ }
+
+ return desc_offset;
}
static VkResult
@@ -2558,7 +2153,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
struct anv_shader_bin *shader,
struct anv_state *bt_state)
{
- struct anv_subpass *subpass = cmd_buffer->state.subpass;
uint32_t state_offset;
struct anv_pipeline_bind_map *map = &shader->bind_map;
@@ -2575,13 +2169,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
if (bt_state->map == NULL)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
- /* We only need to emit relocs if we're not using softpin. If we are using
- * softpin then we always keep all user-allocated memory objects resident.
- */
- const bool need_client_mem_relocs =
- !anv_use_softpin(cmd_buffer->device->physical);
- struct anv_push_constants *push = &pipe_state->push_constants;
-
for (uint32_t s = 0; s < map->surface_count; s++) {
struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
@@ -2595,90 +2182,66 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
/* Color attachment binding */
assert(shader->stage == MESA_SHADER_FRAGMENT);
- if (binding->index < subpass->color_count) {
- const unsigned att =
- subpass->color_attachments[binding->index].attachment;
-
- /* From the Vulkan 1.0.46 spec:
- *
- * "If any color or depth/stencil attachments are
- * VK_ATTACHMENT_UNUSED, then no writes occur for those
- * attachments."
- */
- if (att == VK_ATTACHMENT_UNUSED) {
- surface_state = cmd_buffer->state.null_surface_state;
- } else {
- surface_state = cmd_buffer->state.attachments[att].color.state;
- }
+ if (binding->index < cmd_buffer->state.gfx.color_att_count) {
+ const struct anv_attachment *att =
+ &cmd_buffer->state.gfx.color_att[binding->index];
+ surface_state = att->surface_state.state;
} else {
- surface_state = cmd_buffer->state.null_surface_state;
+ surface_state = cmd_buffer->state.gfx.null_surface_state;
}
-
- assert(surface_state.map);
- bt_map[s] = surface_state.offset + state_offset;
- break;
-
- case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
- struct anv_state surface_state =
- anv_cmd_buffer_alloc_surface_state(cmd_buffer);
-
- struct anv_address constant_data = {
- .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
- .offset = shader->kernel.offset +
- shader->prog_data->const_data_offset,
- };
- unsigned constant_data_size = shader->prog_data->const_data_size;
-
- const enum isl_format format =
- anv_isl_format_for_descriptor_type(cmd_buffer->device,
- VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
- anv_fill_buffer_surface_state(cmd_buffer->device,
- surface_state, format,
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
- constant_data, constant_data_size, 1);
-
assert(surface_state.map);
bt_map[s] = surface_state.offset + state_offset;
- add_surface_reloc(cmd_buffer, surface_state, constant_data);
break;
- }
case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
/* This is always the first binding for compute shaders */
assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
struct anv_state surface_state =
- anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (surface_state.map == NULL)
+ return VK_ERROR_OUT_OF_DEVICE_MEMORY;
const enum isl_format format =
anv_isl_format_for_descriptor_type(cmd_buffer->device,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
- anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
- format,
+ anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
cmd_buffer->state.compute.num_workgroups,
12, 1);
assert(surface_state.map);
bt_map[s] = surface_state.offset + state_offset;
- if (need_client_mem_relocs) {
- add_surface_reloc(cmd_buffer, surface_state,
- cmd_buffer->state.compute.num_workgroups);
- }
break;
}
case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
+ struct anv_descriptor_set *set =
+ pipe_state->descriptors[binding->index];
+
+ /* If the shader doesn't access the set buffer, just put the null
+ * surface.
+ */
+ if (set->is_push && !shader->push_desc_info.used_set_buffer) {
+ bt_map[s] = 0;
+ break;
+ }
+
/* This is a descriptor set buffer so the set index is actually
* given by binding->binding. (Yes, that's confusing.)
*/
- struct anv_descriptor_set *set =
- pipe_state->descriptors[binding->index];
- assert(set->desc_mem.alloc_size);
+ assert(set->desc_surface_mem.alloc_size);
assert(set->desc_surface_state.alloc_size);
bt_map[s] = set->desc_surface_state.offset + state_offset;
- add_surface_reloc(cmd_buffer, set->desc_surface_state,
- anv_descriptor_set_address(set));
+ add_surface_reloc(cmd_buffer, anv_descriptor_set_address(set));
+ break;
+ }
+
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
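+         /* Surface state used to access the contents of the descriptor
+          * buffer.
+          */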
+ assert(pipe_state->descriptor_buffers[binding->index].state.alloc_size);
+ bt_map[s] = pipe_state->descriptor_buffers[binding->index].state.offset +
+ state_offset;
break;
}
@@ -2686,6 +2249,7 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
assert(binding->set < MAX_SETS);
const struct anv_descriptor_set *set =
pipe_state->descriptors[binding->set];
+
if (binding->index >= set->descriptor_count) {
/* From the Vulkan spec section entitled "DescriptorSet and
* Binding Assignment":
@@ -2702,162 +2266,45 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
assert(binding->index < set->layout->descriptor_count);
continue;
}
- const struct anv_descriptor *desc = &set->descriptors[binding->index];
- switch (desc->type) {
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- /* Nothing for us to do here */
- continue;
+      /* For push descriptors, if the binding is fully promoted to push
+       * constants, just reference the null surface in the binding table.
+       * It's unused and we didn't allocate/pack a surface state for it.
+ */
+ if (set->is_push) {
+ uint32_t desc_idx = set->layout->binding[binding->binding].descriptor_index;
+ assert(desc_idx < MAX_PUSH_DESCRIPTORS);
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: {
- if (desc->image_view) {
- struct anv_surface_state sstate =
- (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
- desc->image_view->planes[binding->plane].general_sampler_surface_state :
- desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
- surface_state = sstate.state;
- assert(surface_state.alloc_size);
- if (need_client_mem_relocs)
- add_surface_state_relocs(cmd_buffer, sstate);
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
+ if (shader->push_desc_info.fully_promoted_ubo_descriptors & BITFIELD_BIT(desc_idx)) {
+ surface_state =
+ anv_null_surface_state_for_binding_table(cmd_buffer->device);
+ break;
}
- break;
}
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- assert(shader->stage == MESA_SHADER_FRAGMENT);
- assert(desc->image_view != NULL);
- if ((desc->image_view->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) == 0) {
- /* For depth and stencil input attachments, we treat it like any
- * old texture that a user may have bound.
- */
- assert(desc->image_view->n_planes == 1);
- struct anv_surface_state sstate =
- (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
- desc->image_view->planes[0].general_sampler_surface_state :
- desc->image_view->planes[0].optimal_sampler_surface_state;
- surface_state = sstate.state;
- assert(surface_state.alloc_size);
- if (need_client_mem_relocs)
- add_surface_state_relocs(cmd_buffer, sstate);
- } else {
- /* For color input attachments, we create the surface state at
- * vkBeginRenderPass time so that we can include aux and clear
- * color information.
- */
- assert(binding->input_attachment_index < subpass->input_count);
- const unsigned subpass_att = binding->input_attachment_index;
- const unsigned att = subpass->input_attachments[subpass_att].attachment;
- surface_state = cmd_buffer->state.attachments[att].input.state;
- }
- break;
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
- if (desc->image_view) {
- struct anv_surface_state sstate = (binding->write_only)
- ? desc->image_view->planes[binding->plane].writeonly_storage_surface_state
- : desc->image_view->planes[binding->plane].storage_surface_state;
- surface_state = sstate.state;
- assert(surface_state.alloc_size);
- if (surface_state.offset == 0) {
- mesa_loge("Bound a image to a descriptor where the "
- "descriptor does not have NonReadable "
- "set and the image does not have a "
- "corresponding SPIR-V format enum.");
- vk_debug_report(&cmd_buffer->device->physical->instance->vk,
- VK_DEBUG_REPORT_ERROR_BIT_EXT,
- &desc->image_view->vk.base,
- __LINE__, 0, "anv",
- "Bound a image to a descriptor where the "
- "descriptor does not have NonReadable "
- "set and the image does not have a "
- "corresponding SPIR-V format enum.");
- }
- if (surface_state.offset && need_client_mem_relocs)
- add_surface_state_relocs(cmd_buffer, sstate);
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
- }
- break;
+ const struct anv_descriptor *desc = &set->descriptors[binding->index];
+ if (desc->type == VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR ||
+ desc->type == VK_DESCRIPTOR_TYPE_SAMPLER) {
+ /* Nothing for us to do here */
+ continue;
}
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- if (desc->buffer_view) {
- surface_state = desc->buffer_view->surface_state;
- assert(surface_state.alloc_size);
- if (need_client_mem_relocs) {
- add_surface_reloc(cmd_buffer, surface_state,
- desc->buffer_view->address);
- }
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
- }
- break;
-
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
- if (desc->buffer) {
- /* Compute the offset within the buffer */
- uint32_t dynamic_offset =
- push->dynamic_offsets[binding->dynamic_offset_index];
- uint64_t offset = desc->offset + dynamic_offset;
- /* Clamp to the buffer size */
- offset = MIN2(offset, desc->buffer->size);
- /* Clamp the range to the buffer size */
- uint32_t range = MIN2(desc->range, desc->buffer->size - offset);
-
- /* Align the range for consistency */
- if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
- range = align_u32(range, ANV_UBO_ALIGNMENT);
-
- struct anv_address address =
- anv_address_add(desc->buffer->address, offset);
-
- surface_state =
- anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
- enum isl_format format =
- anv_isl_format_for_descriptor_type(cmd_buffer->device,
- desc->type);
-
- isl_surf_usage_flags_t usage =
- desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
- ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
- ISL_SURF_USAGE_STORAGE_BIT;
-
- anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
- format, usage, address, range, 1);
- if (need_client_mem_relocs)
- add_surface_reloc(cmd_buffer, surface_state, address);
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
- }
- break;
+ const struct anv_pipeline *pipeline = pipe_state->pipeline;
+ uint32_t surface_state_offset;
+ if (pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_INDIRECT) {
+ surface_state_offset =
+ emit_indirect_descriptor_binding_table_entry(cmd_buffer,
+ pipe_state,
+ binding, desc);
+ } else {
+ assert(pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_DIRECT ||
+ pipeline->layout.type == ANV_PIPELINE_DESCRIPTOR_SET_LAYOUT_TYPE_BUFFER);
+ surface_state_offset =
+ emit_direct_descriptor_binding_table_entry(cmd_buffer, pipe_state,
+ set, binding, desc);
}
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- if (desc->buffer_view) {
- surface_state = (binding->write_only)
- ? desc->buffer_view->writeonly_storage_surface_state
- : desc->buffer_view->storage_surface_state;
- assert(surface_state.alloc_size);
- if (need_client_mem_relocs) {
- add_surface_reloc(cmd_buffer, surface_state,
- desc->buffer_view->address);
- }
- } else {
- surface_state = cmd_buffer->device->null_surface_state;
- }
- break;
-
- default:
- assert(!"Invalid descriptor type");
- continue;
- }
- assert(surface_state.map);
- bt_map[s] = surface_state.offset + state_offset;
+ bt_map[s] = surface_state_offset + state_offset;
break;
}
}
@@ -2902,18 +2349,22 @@ emit_samplers(struct anv_cmd_buffer *cmd_buffer,
continue;
memcpy(state->map + (s * 16),
- sampler->state[binding->plane], sizeof(sampler->state[0]));
+ cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ sampler->db_state[binding->plane] :
+ sampler->state[binding->plane],
+ sizeof(sampler->state[0]));
}
return VK_SUCCESS;
}
-static uint32_t
-flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
- struct anv_cmd_pipeline_state *pipe_state,
- const VkShaderStageFlags dirty,
- struct anv_shader_bin **shaders,
- uint32_t num_shaders)
+uint32_t
+genX(cmd_buffer_flush_descriptor_sets)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ const VkShaderStageFlags dirty,
+ struct anv_shader_bin **shaders,
+ uint32_t num_shaders)
{
VkShaderStageFlags flushed = 0;
@@ -2949,10 +2400,10 @@ flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
if (result != VK_SUCCESS)
return 0;
- /* Re-emit state base addresses so we get the new surface state base
+ /* Re-emit the BT base address so we get the new surface state base
* address before we start emitting binding tables etc.
*/
- genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+ genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
/* Re-emit all active binding tables */
flushed = 0;
@@ -2983,2403 +2434,1789 @@ flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
return flushed;
}
-static void
-cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
- uint32_t stages)
+/* This function generates the surface state used to read the content of the
+ * descriptor buffer.
+ */
+void
+genX(cmd_buffer_emit_push_descriptor_buffer_surface)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_descriptor_set *set)
{
- static const uint32_t sampler_state_opcodes[] = {
- [MESA_SHADER_VERTEX] = 43,
- [MESA_SHADER_TESS_CTRL] = 44, /* HS */
- [MESA_SHADER_TESS_EVAL] = 45, /* DS */
- [MESA_SHADER_GEOMETRY] = 46,
- [MESA_SHADER_FRAGMENT] = 47,
- [MESA_SHADER_COMPUTE] = 0,
- };
+ assert(set->desc_surface_state.map == NULL);
- static const uint32_t binding_table_opcodes[] = {
- [MESA_SHADER_VERTEX] = 38,
- [MESA_SHADER_TESS_CTRL] = 39,
- [MESA_SHADER_TESS_EVAL] = 40,
- [MESA_SHADER_GEOMETRY] = 41,
- [MESA_SHADER_FRAGMENT] = 42,
- [MESA_SHADER_COMPUTE] = 0,
- };
-
- anv_foreach_stage(s, stages) {
- assert(s < ARRAY_SIZE(binding_table_opcodes));
- assert(binding_table_opcodes[s] > 0);
+ struct anv_descriptor_set_layout *layout = set->layout;
+ enum isl_format format =
+ anv_isl_format_for_descriptor_type(cmd_buffer->device,
+ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
- if (cmd_buffer->state.samplers[s].alloc_size > 0) {
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
- ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
- ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
- }
- }
+ set->desc_surface_state =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (set->desc_surface_state.map == NULL)
+ return;
+ anv_fill_buffer_surface_state(cmd_buffer->device,
+ set->desc_surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
+ set->desc_surface_addr,
+ layout->descriptor_buffer_surface_size, 1);
+}
- /* Always emit binding table pointers if we're asked to, since on SKL
- * this is what flushes push constants. */
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
- btp._3DCommandSubOpcode = binding_table_opcodes[s];
- btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
+/* This function generates surface states used by a pipeline for push
+ * descriptors. This is delayed to the draw/dispatch time to avoid allocation
+ * and surface state generation when a pipeline is not going to use the
+ * binding table to access any push descriptor data.
+ */
+void
+genX(cmd_buffer_emit_push_descriptor_surfaces)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_descriptor_set *set)
+{
+ while (set->generate_surface_states) {
+ int desc_idx = u_bit_scan(&set->generate_surface_states);
+ struct anv_descriptor *desc = &set->descriptors[desc_idx];
+ struct anv_buffer_view *bview = desc->set_buffer_view;
+
+ if (bview != NULL && bview->general.state.map == NULL) {
+ bview->general.state =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ if (bview->general.state.map == NULL)
+ return;
+ anv_descriptor_write_surface_state(cmd_buffer->device, desc,
+ bview->general.state);
}
}
}
-static struct anv_address
-get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_shader_bin *shader,
- const struct anv_push_range *range)
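+/* Convenience wrapper around genX(batch_emit_pipe_control_write) for the
+ * common case of no post-sync write.
+ */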
+ALWAYS_INLINE void
+genX(batch_emit_pipe_control)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ enum anv_pipe_bits bits,
+ const char *reason)
{
- struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- switch (range->set) {
- case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
- /* This is a descriptor set buffer so the set index is
- * actually given by binding->binding. (Yes, that's
- * confusing.)
- */
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->index];
- return anv_descriptor_set_address(set);
- }
+ genX(batch_emit_pipe_control_write)(batch,
+ devinfo,
+ current_pipeline,
+ NoWrite,
+ ANV_NULL_ADDRESS,
+ 0,
+ bits,
+ reason);
+}
- case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
- if (gfx_state->base.push_constants_state.alloc_size == 0) {
- gfx_state->base.push_constants_state =
- anv_cmd_buffer_gfx_push_constants(cmd_buffer);
- }
- return (struct anv_address) {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = gfx_state->base.push_constants_state.offset,
- };
- }
+ALWAYS_INLINE void
+genX(batch_emit_pipe_control_write)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ uint32_t post_sync_op,
+ struct anv_address address,
+ uint32_t imm_data,
+ enum anv_pipe_bits bits,
+ const char *reason)
+{
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO))
+ unreachable("Trying to emit unsupported PIPE_CONTROL command.");
+
+ /* XXX - insert all workarounds and GFX specific things below. */
- case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
- return (struct anv_address) {
- .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
- .offset = shader->kernel.offset +
- shader->prog_data->const_data_offset,
+ /* Wa_14014966230: For COMPUTE Workload - Any PIPE_CONTROL command with
+ * POST_SYNC Operation Enabled MUST be preceded by a PIPE_CONTROL
+ * with CS_STALL Bit set (with No POST_SYNC ENABLED)
+ */
+ if (intel_device_info_is_adln(devinfo) &&
+ current_pipeline == GPGPU &&
+ post_sync_op != NoWrite) {
+ anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
+ pipe.CommandStreamerStallEnable = true;
+ anv_debug_dump_pc(pipe, "Wa_14014966230");
};
+ }
- default: {
- assert(range->set < MAX_SETS);
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->set];
- const struct anv_descriptor *desc =
- &set->descriptors[range->index];
+ /* SKL PRMs, Volume 7: 3D-Media-GPGPU, Programming Restrictions for
+ * PIPE_CONTROL, Flush Types:
+ * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
+ * For newer platforms this is documented in the PIPE_CONTROL instruction
+ * page.
+ */
+ if (current_pipeline == GPGPU &&
+ (bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT))
+ bits |= ANV_PIPE_CS_STALL_BIT;
- if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
- if (desc->buffer_view)
- return desc->buffer_view->address;
- } else {
- assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
- if (desc->buffer) {
- const struct anv_push_constants *push =
- &gfx_state->base.push_constants;
- uint32_t dynamic_offset =
- push->dynamic_offsets[range->dynamic_offset_index];
- return anv_address_add(desc->buffer->address,
- desc->offset + dynamic_offset);
- }
- }
+#if INTEL_NEEDS_WA_1409600907
+ /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
+ * be set with any PIPE_CONTROL with Depth Flush Enable bit set.
+ */
+ if (bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT)
+ bits |= ANV_PIPE_DEPTH_STALL_BIT;
+#endif
- /* For NULL UBOs, we just return an address in the workaround BO. We do
- * writes to it for workarounds but always at the bottom. The higher
- * bytes should be all zeros.
- */
- assert(range->length * 32 <= 2048);
- return (struct anv_address) {
- .bo = cmd_buffer->device->workaround_bo,
- .offset = 1024,
- };
- }
- }
-}
+ anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
+#if GFX_VERx10 >= 125
+ pipe.UntypedDataPortCacheFlushEnable =
+ bits & ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ pipe.CCSFlushEnable = bits & ANV_PIPE_CCS_CACHE_FLUSH_BIT;
+#endif
+#if GFX_VER == 12
+ pipe.TileCacheFlushEnable = bits & ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+#endif
+#if GFX_VER > 11
+ pipe.HDCPipelineFlushEnable = bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+#endif
+ pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+ pipe.DCFlushEnable = bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ pipe.RenderTargetCacheFlushEnable =
+ bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ pipe.DepthStallEnable = bits & ANV_PIPE_DEPTH_STALL_BIT;
-/** Returns the size in bytes of the bound buffer
- *
- * The range is relative to the start of the buffer, not the start of the
- * range. The returned range may be smaller than
- *
- * (range->start + range->length) * 32;
- */
-static uint32_t
-get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_shader_bin *shader,
- const struct anv_push_range *range)
-{
- assert(shader->stage != MESA_SHADER_COMPUTE);
- const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- switch (range->set) {
- case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->index];
- assert(range->start * 32 < set->desc_mem.alloc_size);
- assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
- return set->desc_mem.alloc_size;
- }
-
- case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
- return (range->start + range->length) * 32;
-
- case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
- return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
-
- default: {
- assert(range->set < MAX_SETS);
- struct anv_descriptor_set *set =
- gfx_state->base.descriptors[range->set];
- const struct anv_descriptor *desc =
- &set->descriptors[range->index];
+ pipe.TLBInvalidate = bits & ANV_PIPE_TLB_INVALIDATE_BIT;
- if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
- if (!desc->buffer_view)
- return 0;
+#if GFX_VERx10 >= 125
+ pipe.PSSStallSyncEnable = bits & ANV_PIPE_PSS_STALL_SYNC_BIT;
+#endif
+ pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
+ pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
- if (range->start * 32 > desc->buffer_view->range)
- return 0;
+ pipe.StateCacheInvalidationEnable =
+ bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
+ pipe.ConstantCacheInvalidationEnable =
+ bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
+#if GFX_VER >= 12
+ /* Invalidates the L3 cache part in which index & vertex data is loaded
+ * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
+ */
+ pipe.L3ReadOnlyCacheInvalidationEnable =
+ bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+#endif
+ pipe.VFCacheInvalidationEnable =
+ bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ pipe.TextureCacheInvalidationEnable =
+ bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+ pipe.InstructionCacheInvalidateEnable =
+ bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
- return desc->buffer_view->range;
- } else {
- if (!desc->buffer)
- return 0;
+ pipe.PostSyncOperation = post_sync_op;
+ pipe.Address = address;
+ pipe.DestinationAddressType = DAT_PPGTT;
+ pipe.ImmediateData = imm_data;
- assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
- /* Compute the offset within the buffer */
- const struct anv_push_constants *push =
- &gfx_state->base.push_constants;
- uint32_t dynamic_offset =
- push->dynamic_offsets[range->dynamic_offset_index];
- uint64_t offset = desc->offset + dynamic_offset;
- /* Clamp to the buffer size */
- offset = MIN2(offset, desc->buffer->size);
- /* Clamp the range to the buffer size */
- uint32_t bound_range = MIN2(desc->range, desc->buffer->size - offset);
-
- /* Align the range for consistency */
- bound_range = align_u32(bound_range, ANV_UBO_ALIGNMENT);
-
- return bound_range;
- }
- }
+ anv_debug_dump_pc(pipe, reason);
}
}
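
The emitter above packs whatever ANV_PIPE_* bits are still pending into a single PIPE_CONTROL. As a rough, standalone sketch of that accumulate-then-apply pattern (hypothetical bit names and a printf in place of the real packet emission, in the same spirit as anv_add_pending_pipe_bits() and genX(cmd_buffer_apply_pipe_flushes)() used elsewhere in this change):

/* Illustrative only: made-up bit names, not the driver's real helpers. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

enum pipe_bits {
   PIPE_RT_FLUSH    = 1u << 0,
   PIPE_DEPTH_FLUSH = 1u << 1,
   PIPE_DEPTH_STALL = 1u << 2,
};

struct pending {
   uint32_t bits;
};

static void add_pending(struct pending *p, uint32_t bits, const char *reason)
{
   p->bits |= bits;   /* callers only accumulate; nothing is emitted yet */
   (void)reason;      /* the real driver keeps this string for debug dumps */
}

static void apply_pending(struct pending *p)
{
   uint32_t bits = p->bits;
   /* Mirrors the workaround above: a depth flush implies a depth stall. */
   if (bits & PIPE_DEPTH_FLUSH)
      bits |= PIPE_DEPTH_STALL;
   printf("emit one PIPE_CONTROL covering bits 0x%" PRIx32 "\n", bits);
   p->bits = 0;
}

int main(void)
{
   struct pending p = { 0 };
   add_pending(&p, PIPE_RT_FLUSH, "render target write");
   add_pending(&p, PIPE_DEPTH_FLUSH, "depth attachment reuse");
   apply_pending(&p);   /* one combined packet for everything pending */
   return 0;
}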
-static void
-cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
- gl_shader_stage stage,
- struct anv_address *buffers,
- unsigned buffer_count)
+/* Set preemption on/off. */
+void
+genX(batch_set_preemption)(struct anv_batch *batch,
+ const struct intel_device_info *devinfo,
+ uint32_t current_pipeline,
+ bool value)
{
- const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
-
- static const uint32_t push_constant_opcodes[] = {
- [MESA_SHADER_VERTEX] = 21,
- [MESA_SHADER_TESS_CTRL] = 25, /* HS */
- [MESA_SHADER_TESS_EVAL] = 26, /* DS */
- [MESA_SHADER_GEOMETRY] = 22,
- [MESA_SHADER_FRAGMENT] = 23,
- [MESA_SHADER_COMPUTE] = 0,
- };
-
- assert(stage < ARRAY_SIZE(push_constant_opcodes));
- assert(push_constant_opcodes[stage] > 0);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
- c._3DCommandSubOpcode = push_constant_opcodes[stage];
+#if GFX_VERx10 >= 120
+ anv_batch_write_reg(batch, GENX(CS_CHICKEN1), cc1) {
+ cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = !value;
+ cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
+ }
- if (anv_pipeline_has_stage(pipeline, stage)) {
- const struct anv_pipeline_bind_map *bind_map =
- &pipeline->shaders[stage]->bind_map;
+ /* Wa_16013994831 - we need to insert CS_STALL and 250 noops. */
+ genx_batch_emit_pipe_control(batch, devinfo, current_pipeline,
+ ANV_PIPE_CS_STALL_BIT);
-#if GFX_VER >= 9
- /* This field exists since Gfx8. However, the Broadwell PRM says:
- *
- * "Constant Buffer Object Control State must be always programmed
- * to zero."
- *
- * This restriction does not exist on any newer platforms.
- *
- * We only have one MOCS field for the whole packet, not one per
- * buffer. We could go out of our way here to walk over all of the
- * buffers and see if any of them are used externally and use the
- * external MOCS. However, the notion that someone would use the
- * same bit of memory for both scanout and a UBO is nuts. Let's not
- * bother and assume it's all internal.
- */
- c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
+ for (unsigned i = 0; i < 250; i++)
+ anv_batch_emit(batch, GENX(MI_NOOP), noop);
#endif
+}
-#if GFX_VERx10 >= 75
- /* The Skylake PRM contains the following restriction:
- *
- * "The driver must ensure The following case does not occur
- * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
- * buffer 3 read length equal to zero committed followed by a
- * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
- * zero committed."
- *
- * To avoid this, we program the buffers in the highest slots.
- * This way, slot 0 is only used if slot 3 is also used.
- */
- assert(buffer_count <= 4);
- const unsigned shift = 4 - buffer_count;
- for (unsigned i = 0; i < buffer_count; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
-
- /* At this point we only have non-empty ranges */
- assert(range->length > 0);
-
- /* For Ivy Bridge, make sure we only set the first range (actual
- * push constants)
- */
- assert((GFX_VERx10 >= 75) || i == 0);
+void
+genX(cmd_buffer_set_preemption)(struct anv_cmd_buffer *cmd_buffer, bool value)
+{
+#if GFX_VERx10 >= 120
+ if (cmd_buffer->state.gfx.object_preemption == value)
+ return;
- c.ConstantBody.ReadLength[i + shift] = range->length;
- c.ConstantBody.Buffer[i + shift] =
- anv_address_add(buffers[i], range->start * 32);
- }
-#else
- /* For Ivy Bridge, push constants are relative to dynamic state
- * base address and we only ever push actual push constants.
- */
- if (bind_map->push_ranges[0].length > 0) {
- assert(buffer_count == 1);
- assert(bind_map->push_ranges[0].set ==
- ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
- assert(buffers[0].bo ==
- cmd_buffer->device->dynamic_state_pool.block_pool.bo);
- c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
- c.ConstantBody.Buffer[0].bo = NULL;
- c.ConstantBody.Buffer[0].offset = buffers[0].offset;
- }
- assert(bind_map->push_ranges[1].length == 0);
- assert(bind_map->push_ranges[2].length == 0);
- assert(bind_map->push_ranges[3].length == 0);
+ genX(batch_set_preemption)(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ value);
+ cmd_buffer->state.gfx.object_preemption = value;
#endif
- }
- }
}
-#if GFX_VER >= 12
-static void
-cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
- uint32_t shader_mask,
- struct anv_address *buffers,
- uint32_t buffer_count)
+ALWAYS_INLINE static void
+update_descriptor_set_surface_state(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state,
+ uint32_t set_idx)
{
- if (buffer_count == 0) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
- c.ShaderUpdateEnable = shader_mask;
- c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
- }
+ if (!pipe_state->descriptor_buffers[set_idx].bound)
return;
- }
- const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
-
- static const UNUSED uint32_t push_constant_opcodes[] = {
- [MESA_SHADER_VERTEX] = 21,
- [MESA_SHADER_TESS_CTRL] = 25, /* HS */
- [MESA_SHADER_TESS_EVAL] = 26, /* DS */
- [MESA_SHADER_GEOMETRY] = 22,
- [MESA_SHADER_FRAGMENT] = 23,
- [MESA_SHADER_COMPUTE] = 0,
- };
-
- gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
- assert(stage < ARRAY_SIZE(push_constant_opcodes));
- assert(push_constant_opcodes[stage] > 0);
-
- const struct anv_pipeline_bind_map *bind_map =
- &pipeline->shaders[stage]->bind_map;
-
- uint32_t *dw;
- const uint32_t buffer_mask = (1 << buffer_count) - 1;
- const uint32_t num_dwords = 2 + 2 * buffer_count;
-
- dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
- GENX(3DSTATE_CONSTANT_ALL),
- .ShaderUpdateEnable = shader_mask,
- .PointerBufferMask = buffer_mask,
- .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
-
- for (int i = 0; i < buffer_count; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
- GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
- &cmd_buffer->batch, dw + 2 + i * 2,
- &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
- .PointerToConstantBuffer =
- anv_address_add(buffers[i], range->start * 32),
- .ConstantBufferReadLength = range->length,
- });
+ const struct anv_physical_device *device = cmd_buffer->device->physical;
+ const int32_t buffer_index =
+ pipe_state->descriptor_buffers[set_idx].buffer_index;
+ const struct anv_va_range *push_va_range =
+ GFX_VERx10 >= 125 ?
+ &device->va.push_descriptor_buffer_pool :
+ &device->va.internal_surface_state_pool;
+ const struct anv_va_range *va_range =
+ buffer_index == -1 ? push_va_range : &device->va.descriptor_buffer_pool;
+ const uint64_t descriptor_set_addr =
+ (buffer_index == -1 ? va_range->addr :
+ cmd_buffer->state.descriptor_buffers.address[buffer_index]) +
+ pipe_state->descriptor_buffers[set_idx].buffer_offset;
+ const uint64_t set_size =
+ MIN2(va_range->size - (descriptor_set_addr - va_range->addr),
+ anv_physical_device_bindless_heap_size(device, true));
+
+ if (descriptor_set_addr != pipe_state->descriptor_buffers[set_idx].address) {
+ pipe_state->descriptor_buffers[set_idx].address = descriptor_set_addr;
+
+ struct anv_state surface_state =
+ anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
+ const enum isl_format format =
+ anv_isl_format_for_descriptor_type(cmd_buffer->device,
+ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
+ anv_fill_buffer_surface_state(
+ cmd_buffer->device, surface_state.map,
+ format, ISL_SWIZZLE_IDENTITY,
+ ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
+ anv_address_from_u64(pipe_state->descriptor_buffers[set_idx].address),
+ set_size, 1);
+
+ pipe_state->descriptor_buffers[set_idx].state = surface_state;
}
}
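
The set_size clamp above bounds the surface both by what remains of the VA range past the set's address and by the bindless heap limit. A minimal standalone sketch of that arithmetic, with made-up addresses and sizes:

/* Hypothetical addresses and sizes, purely to illustrate the clamp above. */
#include <assert.h>
#include <stdint.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
   const uint64_t range_addr = 0x100000000ull;            /* pool base */
   const uint64_t range_size = 2ull << 30;                /* 2 GiB VA range */
   const uint64_t heap_limit = 1ull << 30;                /* bindless heap cap */
   const uint64_t set_addr   = range_addr + (3ull << 29); /* 1.5 GiB in */

   /* Surface covers whichever is smaller: the rest of the range or the cap. */
   uint64_t set_size = MIN2(range_size - (set_addr - range_addr), heap_limit);

   assert(set_size == (1ull << 29));   /* only 512 MiB of the range remain */
   return 0;
}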
-#endif
-static void
-cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
- VkShaderStageFlags dirty_stages)
+ALWAYS_INLINE static uint32_t
+compute_descriptor_set_surface_offset(const struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_cmd_pipeline_state *pipe_state,
+ const uint32_t set_idx)
{
- VkShaderStageFlags flushed = 0;
- struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
- const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
-
-#if GFX_VER >= 12
- uint32_t nobuffer_stages = 0;
-#endif
-
- /* Compute robust pushed register access mask for each stage. */
- if (cmd_buffer->device->robust_buffer_access) {
- anv_foreach_stage(stage, dirty_stages) {
- if (!anv_pipeline_has_stage(pipeline, stage))
- continue;
-
- const struct anv_shader_bin *shader = pipeline->shaders[stage];
- const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
- struct anv_push_constants *push = &gfx_state->base.push_constants;
-
- push->push_reg_mask[stage] = 0;
- /* Start of the current range in the shader, relative to the start of
- * push constants in the shader.
- */
- unsigned range_start_reg = 0;
- for (unsigned i = 0; i < 4; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
- if (range->length == 0)
- continue;
-
- unsigned bound_size =
- get_push_range_bound_size(cmd_buffer, shader, range);
- if (bound_size >= range->start * 32) {
- unsigned bound_regs =
- MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
- range->length);
- assert(range_start_reg + bound_regs <= 64);
- push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
- bound_regs);
- }
+ const struct anv_physical_device *device = cmd_buffer->device->physical;
- cmd_buffer->state.push_constants_dirty |=
- mesa_to_vk_shader_stage(stage);
+ if (device->uses_ex_bso) {
+ int32_t buffer_index =
+ pipe_state->descriptor_buffers[set_idx].buffer_index;
+ uint64_t buffer_address =
+ buffer_index == -1 ?
+ device->va.push_descriptor_buffer_pool.addr :
+ cmd_buffer->state.descriptor_buffers.address[buffer_index];
- range_start_reg += range->length;
- }
- }
+ return (buffer_address - device->va.descriptor_buffer_pool.addr) +
+ pipe_state->descriptor_buffers[set_idx].buffer_offset;
}
- /* Resets the push constant state so that we allocate a new one if
- * needed.
- */
- gfx_state->base.push_constants_state = ANV_STATE_NULL;
-
- anv_foreach_stage(stage, dirty_stages) {
- unsigned buffer_count = 0;
- flushed |= mesa_to_vk_shader_stage(stage);
- UNUSED uint32_t max_push_range = 0;
-
- struct anv_address buffers[4] = {};
- if (anv_pipeline_has_stage(pipeline, stage)) {
- const struct anv_shader_bin *shader = pipeline->shaders[stage];
- const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
-
- /* We have to gather buffer addresses as a second step because the
- * loop above puts data into the push constant area and the call to
- * get_push_range_address is what locks our push constants and copies
- * them into the actual GPU buffer. If we did the two loops at the
- * same time, we'd risk only having some of the sizes in the push
- * constant buffer when we did the copy.
- */
- for (unsigned i = 0; i < 4; i++) {
- const struct anv_push_range *range = &bind_map->push_ranges[i];
- if (range->length == 0)
- break;
+ return pipe_state->descriptor_buffers[set_idx].buffer_offset << 6;
+}
- buffers[i] = get_push_range_address(cmd_buffer, shader, range);
- max_push_range = MAX2(max_push_range, range->length);
- buffer_count++;
- }
+ALWAYS_INLINE static uint32_t
+compute_descriptor_set_sampler_offset(const struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_cmd_pipeline_state *pipe_state,
+ const uint32_t set_idx)
+{
+ const struct anv_physical_device *device = cmd_buffer->device->physical;
+ int32_t buffer_index =
+ pipe_state->descriptor_buffers[set_idx].buffer_index;
+ uint64_t buffer_address =
+ buffer_index == -1 ?
+ device->va.push_descriptor_buffer_pool.addr :
+ cmd_buffer->state.descriptor_buffers.address[buffer_index];
+
+ return (buffer_address - device->va.dynamic_state_db_pool.addr) +
+ pipe_state->descriptor_buffers[set_idx].buffer_offset;
+}
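
Both offset helpers above boil down to "backing buffer address minus the relevant pool base, plus the set's offset inside the buffer". A small sketch of that computation with hypothetical pool and buffer addresses:

/* Hypothetical pool base and binding, only to show the subtraction above. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
   const uint64_t pool_base      = 0x200000000ull;  /* e.g. a *_pool.addr */
   const uint64_t buffer_address = 0x200040000ull;  /* bound descriptor buffer */
   const uint32_t buffer_offset  = 0x1000;          /* set offset in the buffer */

   /* 32-bit offset relative to the pool base, later consumed by the shader. */
   uint32_t set_offset = (uint32_t)(buffer_address - pool_base) + buffer_offset;

   assert(set_offset == 0x41000);
   return 0;
}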
- /* We have at most 4 buffers but they should be tightly packed */
- for (unsigned i = buffer_count; i < 4; i++)
- assert(bind_map->push_ranges[i].length == 0);
- }
+void
+genX(flush_descriptor_buffers)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_cmd_pipeline_state *pipe_state)
+{
+ /* On Gfx12.5+ the STATE_BASE_ADDRESS BindlessSurfaceStateBaseAddress &
+ * DynamicStateBaseAddress are fixed. So as long as we stay in one
+ * descriptor buffer mode, there is no need to switch.
+ */
+#if GFX_VERx10 >= 125
+ if (cmd_buffer->state.current_db_mode !=
+ cmd_buffer->state.pending_db_mode)
+ genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+#else
+ if (cmd_buffer->state.descriptor_buffers.dirty)
+ genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+#endif
-#if GFX_VER >= 12
- /* If this stage doesn't have any push constants, emit it later in a
- * single CONSTANT_ALL packet.
- */
- if (buffer_count == 0) {
- nobuffer_stages |= 1 << stage;
- continue;
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ if (cmd_buffer->state.current_db_mode == ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER &&
+ (cmd_buffer->state.descriptor_buffers.dirty ||
+ (pipe_state->pipeline->active_stages &
+ cmd_buffer->state.descriptor_buffers.offsets_dirty) != 0)) {
+ struct anv_push_constants *push_constants =
+ &pipe_state->push_constants;
+ for (uint32_t i = 0; i < ARRAY_SIZE(push_constants->desc_surface_offsets); i++) {
+ update_descriptor_set_surface_state(cmd_buffer, pipe_state, i);
+
+ push_constants->desc_surface_offsets[i] =
+ compute_descriptor_set_surface_offset(cmd_buffer, pipe_state, i);
+ push_constants->desc_sampler_offsets[i] =
+ compute_descriptor_set_sampler_offset(cmd_buffer, pipe_state, i);
}
- /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
- * contains only 5 bits, so we can only use it for buffers smaller than
- * 32.
- */
- if (max_push_range < 32) {
- cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
- buffers, buffer_count);
- continue;
- }
+#if GFX_VERx10 < 125
+ struct anv_device *device = cmd_buffer->device;
+ push_constants->surfaces_base_offset =
+ (cmd_buffer->state.descriptor_buffers.surfaces_address -
+ device->physical->va.descriptor_buffer_pool.addr);
#endif
- cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
+ cmd_buffer->state.push_constants_dirty |=
+ (cmd_buffer->state.descriptor_buffers.offsets_dirty &
+ pipe_state->pipeline->active_stages);
+ pipe_state->push_constants_data_dirty = true;
+ cmd_buffer->state.descriptor_buffers.offsets_dirty &=
+ ~pipe_state->pipeline->active_stages;
}
-#if GFX_VER >= 12
- if (nobuffer_stages)
- cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
-#endif
-
- cmd_buffer->state.push_constants_dirty &= ~flushed;
+ cmd_buffer->state.descriptor_buffers.dirty = false;
}
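
The tail of flush_descriptor_buffers above only marks push constants dirty for the stages the current pipeline actually uses, and keeps the remaining stages pending. A standalone sketch of that mask bookkeeping, using made-up stage bits:

/* Made-up stage bits; only the mask bookkeeping mirrors the code above. */
#include <assert.h>
#include <stdint.h>

enum { STAGE_VS = 1u << 0, STAGE_FS = 1u << 1, STAGE_CS = 1u << 2 };

int main(void)
{
   uint32_t offsets_dirty = STAGE_VS | STAGE_FS | STAGE_CS; /* all touched */
   uint32_t push_dirty    = 0;
   const uint32_t active  = STAGE_VS | STAGE_FS;            /* graphics only */

   /* Flag push constants dirty for the stages this pipeline covers... */
   push_dirty |= offsets_dirty & active;
   /* ...and keep the remaining stages pending for a later flush. */
   offsets_dirty &= ~active;

   assert(push_dirty == (STAGE_VS | STAGE_FS));
   assert(offsets_dirty == STAGE_CS);
   return 0;
}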
-static void
-cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
+void
+genX(cmd_buffer_begin_companion)(struct anv_cmd_buffer *cmd_buffer,
+ VkCommandBufferLevel level)
{
- const uint32_t clip_states =
-#if GFX_VER <= 7
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
-#endif
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
- ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
- ANV_CMD_DIRTY_PIPELINE;
+ cmd_buffer->vk.level = level;
+ cmd_buffer->is_companion_rcs_cmd_buffer = true;
- if ((cmd_buffer->state.gfx.dirty & clip_states) == 0)
- return;
+ trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
- /* Take dynamic primitive topology in to account with
- * 3DSTATE_CLIP::ViewportXYClipTestEnable
+#if GFX_VER >= 12
+ /* Reenable prefetching at the beginning of secondary command buffers. We
+ * do this so that the patched return instruction is not prefetched before
+ * its patching has completed.
*/
- bool xy_clip_test_enable = 0;
-
- if (cmd_buffer->state.gfx.pipeline->dynamic_states &
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- VkPrimitiveTopology primitive_topology =
- cmd_buffer->state.gfx.dynamic.primitive_topology;
-
- VkPolygonMode dynamic_raster_mode =
- genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
- primitive_topology);
-
- xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = false;
+ }
}
-
-#if GFX_VER <= 7
- const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
-#endif
- struct GENX(3DSTATE_CLIP) clip = {
- GENX(3DSTATE_CLIP_header),
-#if GFX_VER <= 7
- .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
- .CullMode = genX(vk_to_intel_cullmode)[d->cull_mode],
#endif
- .ViewportXYClipTestEnable = xy_clip_test_enable,
- };
- uint32_t dwords[GENX(3DSTATE_CLIP_length)];
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vue_prog_data *last =
- anv_pipeline_get_last_vue_prog_data(pipeline);
- if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
- clip.MaximumVPIndex =
- cmd_buffer->state.gfx.dynamic.viewport.count > 0 ?
- cmd_buffer->state.gfx.dynamic.viewport.count - 1 : 0;
- }
+ /* A companion command buffer is only used for blorp commands atm, so
+ * default to the legacy mode.
+ */
+ cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
+ genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
- GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
- anv_batch_emit_merge(&cmd_buffer->batch, dwords,
- pipeline->gfx7.clip);
+ /* Re-emit the aux table register in every command buffer. This way we're
+ * ensured that we have the table even if this command buffer doesn't
+ * initialize any images.
+ */
+ if (cmd_buffer->device->info->has_aux_map) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
+ "new cmd buffer with aux-tt");
+ }
}
static void
-cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
+genX(cmd_buffer_set_protected_memory)(struct anv_cmd_buffer *cmd_buffer,
+ bool enabled)
{
- const struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
-
-#if GFX_VER == 7
-# define streamout_state_dw pipeline->gfx7.streamout_state
+#if GFX_VER >= 12
+ if (enabled) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_SET_APPID), appid) {
+ /* Default value for single session. */
+ appid.ProtectedMemoryApplicationID = cmd_buffer->device->protected_session_id;
+ appid.ProtectedMemoryApplicationIDType = DISPLAY_APP;
+ }
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.PipeControlFlushEnable = true;
+ pc.DCFlushEnable = true;
+ pc.RenderTargetCacheFlushEnable = true;
+ pc.CommandStreamerStallEnable = true;
+ if (enabled)
+ pc.ProtectedMemoryEnable = true;
+ else
+ pc.ProtectedMemoryDisable = true;
+ }
#else
-# define streamout_state_dw pipeline->gfx8.streamout_state
+ unreachable("Protected content not supported");
#endif
-
- uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
-
- struct GENX(3DSTATE_STREAMOUT) so = {
- GENX(3DSTATE_STREAMOUT_header),
- .RenderingDisable = d->raster_discard,
- };
- GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
- anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
}
-void
-genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
+VkResult
+genX(BeginCommandBuffer)(
+ VkCommandBuffer commandBuffer,
+ const VkCommandBufferBeginInfo* pBeginInfo)
{
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- uint32_t *p;
-
- assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
-
- genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
-
- genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
-
- genX(flush_pipeline_select_3d)(cmd_buffer);
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ VkResult result;
- /* Apply any pending pipeline flushes we may have. We want to apply them
- * now because, if any of those flushes are for things like push constants,
- * the GPU will read the state at weird times.
+ /* If this is the first vkBeginCommandBuffer, we must *initialize* the
+ * command buffer's state. Otherwise, we must *reset* its state. In both
+ * cases we reset it.
+ *
+ * From the Vulkan 1.0 spec:
+ *
+ * If a command buffer is in the executable state and the command buffer
+ * was allocated from a command pool with the
+ * VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
+ * vkBeginCommandBuffer implicitly resets the command buffer, behaving
+ * as if vkResetCommandBuffer had been called with
+ * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
+ * the command buffer in the recording state.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
- vb_emit |= pipeline->vb_used;
+ anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
+ anv_cmd_buffer_reset_rendering(cmd_buffer);
- if (vb_emit) {
- const uint32_t num_buffers = __builtin_popcount(vb_emit);
- const uint32_t num_dwords = 1 + num_buffers * 4;
-
- p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
- GENX(3DSTATE_VERTEX_BUFFERS));
- uint32_t i = 0;
- u_foreach_bit(vb, vb_emit) {
- struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
- uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
-
- /* If dynamic, use stride/size from vertex binding, otherwise use
- * stride/size that was setup in the pipeline object.
- */
- bool dynamic_stride = cmd_buffer->state.gfx.dynamic.dyn_vbo_stride;
- bool dynamic_size = cmd_buffer->state.gfx.dynamic.dyn_vbo_size;
-
- struct GENX(VERTEX_BUFFER_STATE) state;
- if (buffer) {
- uint32_t stride = dynamic_stride ?
- cmd_buffer->state.vertex_bindings[vb].stride : pipeline->vb[vb].stride;
- /* From the Vulkan spec (vkCmdBindVertexBuffers2EXT):
- *
- * "If pname:pSizes is not NULL then pname:pSizes[i] specifies
- * the bound size of the vertex buffer starting from the corresponding
- * elements of pname:pBuffers[i] plus pname:pOffsets[i]."
- */
- UNUSED uint32_t size = dynamic_size ?
- cmd_buffer->state.vertex_bindings[vb].size : buffer->size - offset;
+ cmd_buffer->usage_flags = pBeginInfo->flags;
- state = (struct GENX(VERTEX_BUFFER_STATE)) {
- .VertexBufferIndex = vb,
+ /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
+ * primary level command buffers.
+ *
+ * From the Vulkan 1.0 spec:
+ *
+ * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
+ * secondary command buffer is considered to be entirely inside a render
+ * pass. If this is a primary command buffer, then this bit is ignored.
+ */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
+ cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
- .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
- ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
-#if GFX_VER <= 7
- .BufferAccessType = pipeline->vb[vb].instanced ? INSTANCEDATA : VERTEXDATA,
- .InstanceDataStepRate = pipeline->vb[vb].instance_divisor,
-#endif
- .AddressModifyEnable = true,
- .BufferPitch = stride,
- .BufferStartingAddress = anv_address_add(buffer->address, offset),
- .NullVertexBuffer = offset >= buffer->size,
#if GFX_VER >= 12
- .L3BypassDisable = true,
-#endif
-
-#if GFX_VER >= 8
- .BufferSize = size,
-#else
- /* XXX: to handle dynamic offset for older gens we might want
- * to modify Endaddress, but there are issues when doing so:
- *
- * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
- */
- .EndAddress = anv_address_add(buffer->address, buffer->size - 1),
-#endif
- };
- } else {
- state = (struct GENX(VERTEX_BUFFER_STATE)) {
- .VertexBufferIndex = vb,
- .NullVertexBuffer = true,
- };
- }
-
-#if GFX_VER >= 8 && GFX_VER <= 9
- genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
- state.BufferStartingAddress,
- state.BufferSize);
-#endif
-
- GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
- i++;
+ /* Reenable prefetching at the beginning of secondary command buffers. We
+ * do this so that the patched return instruction is not prefetched before
+ * its patching has completed.
+ */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = false;
}
}
+#endif
- cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
+ /* Assume the viewport has already been set in primary command buffers. */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
+ cmd_buffer->state.gfx.viewport_set = true;
- uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
- pipeline->active_stages;
- if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
- !cmd_buffer->state.push_constants_dirty)
- return;
+ trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
- if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
- (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
- ANV_CMD_DIRTY_PIPELINE))) {
- /* We don't need any per-buffer dirty tracking because you're not
- * allowed to bind different XFB buffers while XFB is enabled.
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+ /* Re-emit the aux table register in every command buffer. This way we're
+ * ensured that we have the table even if this command buffer doesn't
+ * initialize any images.
*/
- for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
- struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
-#if GFX_VER < 12
- sob.SOBufferIndex = idx;
-#else
- sob._3DCommandOpcode = 0;
- sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
-#endif
-
- if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
- sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo, 0);
- sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
- xfb->offset);
-#if GFX_VER >= 8
- sob.SOBufferEnable = true;
- sob.StreamOffsetWriteEnable = false;
- /* Size is in DWords - 1 */
- sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
-#else
- /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
- * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
- * default for an empty SO_BUFFER packet) to disable them.
- */
- sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
- sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
- xfb->offset + xfb->size);
-#endif
- }
- }
- }
-
- /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
- if (GFX_VER >= 10) {
+ if (cmd_buffer->device->info->has_aux_map) {
anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "after 3DSTATE_SO_BUFFER call");
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
+ "new cmd buffer with aux-tt");
}
+ return VK_SUCCESS;
}
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
- anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
-
- /* Remove from dynamic state emission all of stuff that is baked into
- * the pipeline.
- */
- cmd_buffer->state.gfx.dirty &= ~pipeline->static_state_mask;
-
- /* If the pipeline changed, we may need to re-allocate push constant
- * space in the URB.
- */
- cmd_buffer_alloc_push_constants(cmd_buffer);
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
- cmd_buffer->state.gfx.primitive_topology = pipeline->topology;
+#if GFX_VER >= 12
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
+ cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ genX(cmd_buffer_set_protected_memory)(cmd_buffer, true);
+#endif
-#if GFX_VER <= 7
- if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
- cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
- /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
- *
- * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
- * stall needs to be sent just prior to any 3DSTATE_VS,
- * 3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
- * 3DSTATE_BINDING_TABLE_POINTER_VS,
- * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one
- * PIPE_CONTROL needs to be sent before any combination of VS
- * associated 3DSTATE."
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DepthStallEnable = true;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = cmd_buffer->device->workaround_address;
- anv_debug_dump_pc(pc);
- }
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
+ } else {
+ cmd_buffer->state.current_db_mode = ANV_CMD_DESCRIPTOR_BUFFER_MODE_LEGACY;
+ genX(cmd_buffer_emit_bt_pool_base_address)(cmd_buffer);
}
-#endif
- /* Render targets live in the same binding table as fragment descriptors */
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
- descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+ /* We sometimes store vertex data in the dynamic state buffer for blorp
+ * operations and our dynamic state stream may re-use data from previous
+ * command buffers. In order to prevent stale cache data, we flush the VF
+ * cache. We could do this on every blorp call but that's not really
+ * needed as all of the data will get written by the CPU prior to the GPU
+ * executing anything. The chances are fairly high that they will use
+ * blorp at least once per primary command buffer so it shouldn't be
+ * wasted.
+ *
+ * There is also a workaround on gfx8 which requires us to invalidate the
+ * VF cache occasionally. It's easier if we can assume we start with a
+ * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
+ "new cmd buffer");
- /* We emit the binding tables and sampler tables first, then emit push
- * constants and then finally emit binding table and sampler table
- * pointers. It has to happen in this order, since emitting the binding
- * tables may change the push constants (in case of storage images). After
- * emitting push constants, on SKL+ we have to emit the corresponding
- * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
+ /* Re-emit the aux table register in every command buffer. This way we're
+ * ensured that we have the table even if this command buffer doesn't
+ * initialize any images.
*/
- uint32_t dirty = 0;
- if (descriptors_dirty) {
- dirty = flush_descriptor_sets(cmd_buffer,
- &cmd_buffer->state.gfx.base,
- descriptors_dirty,
- pipeline->shaders,
- ARRAY_SIZE(pipeline->shaders));
- cmd_buffer->state.descriptors_dirty &= ~dirty;
- }
-
- if (dirty || cmd_buffer->state.push_constants_dirty) {
- /* Because we're pushing UBOs, we have to push whenever either
- * descriptors or push constants is dirty.
- */
- dirty |= cmd_buffer->state.push_constants_dirty;
- dirty &= ANV_STAGE_MASK & VK_SHADER_STAGE_ALL_GRAPHICS;
- cmd_buffer_flush_push_constants(cmd_buffer, dirty);
+ if (cmd_buffer->device->info->has_aux_map) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_AUX_TABLE_INVALIDATE_BIT,
+ "new cmd buffer with aux-tt");
}
- if (dirty)
- cmd_buffer_emit_descriptor_pointers(cmd_buffer, dirty);
-
- cmd_buffer_emit_clip(cmd_buffer);
+ /* We send an "Indirect State Pointers Disable" packet at
+ * EndCommandBuffer, so all push constant packets are ignored during a
+ * context restore. Documentation says after that command, we need to
+ * emit push constants again before any rendering operation. So we
+ * flag them dirty here to make sure they get emitted.
+ */
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
+ cmd_buffer->state.gfx.base.push_constants_data_dirty = true;
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
- cmd_buffer_emit_streamout(cmd_buffer);
+ if (cmd_buffer->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+
+ char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
+ const VkRenderingInfo *resume_info =
+ vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
+ pBeginInfo,
+ gcbiar_data);
+ if (resume_info != NULL) {
+ genX(CmdBeginRendering)(commandBuffer, resume_info);
+ } else {
+ const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
+ vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
+ pBeginInfo);
+ assert(inheritance_info);
+
+ gfx->rendering_flags = inheritance_info->flags;
+ gfx->render_area = (VkRect2D) { };
+ gfx->layer_count = 0;
+ gfx->samples = inheritance_info->rasterizationSamples;
+ gfx->view_mask = inheritance_info->viewMask;
+
+ uint32_t color_att_count = inheritance_info->colorAttachmentCount;
+ result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
+ if (result != VK_SUCCESS)
+ return result;
+
+ for (uint32_t i = 0; i < color_att_count; i++) {
+ gfx->color_att[i].vk_format =
+ inheritance_info->pColorAttachmentFormats[i];
+ }
+ gfx->depth_att.vk_format =
+ inheritance_info->depthAttachmentFormat;
+ gfx->stencil_att.vk_format =
+ inheritance_info->stencilAttachmentFormat;
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_VIEWPORT)
- gfx8_cmd_buffer_emit_viewport(cmd_buffer);
+ anv_cmd_graphic_state_update_has_uint_rt(gfx);
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_VIEWPORT |
- ANV_CMD_DIRTY_PIPELINE)) {
- gfx8_cmd_buffer_emit_depth_viewport(cmd_buffer,
- pipeline->depth_clamp_enable);
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_AREA |
+ ANV_CMD_DIRTY_RENDER_TARGETS;
+ }
}
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_SCISSOR |
- ANV_CMD_DIRTY_RENDER_TARGETS))
- gfx7_cmd_buffer_emit_scissor(cmd_buffer);
+ /* Emit the sample pattern at the beginning of the batch because the
+ * default locations emitted at the device initialization might have been
+ * changed by a previous command buffer.
+ *
+ * Do not change that when we're continuing a previous renderpass.
+ */
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
+ !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
+ genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
- genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
-}
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+ const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
+ vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
-static void
-emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address addr,
- uint32_t size, uint32_t index)
-{
- uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
- GENX(3DSTATE_VERTEX_BUFFERS));
-
- GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
- &(struct GENX(VERTEX_BUFFER_STATE)) {
- .VertexBufferIndex = index,
- .AddressModifyEnable = true,
- .BufferPitch = 0,
- .MOCS = addr.bo ? anv_mocs(cmd_buffer->device, addr.bo,
- ISL_SURF_USAGE_VERTEX_BUFFER_BIT) : 0,
- .NullVertexBuffer = size == 0,
-#if GFX_VER >= 12
- .L3BypassDisable = true,
-#endif
-#if (GFX_VER >= 8)
- .BufferStartingAddress = addr,
- .BufferSize = size
-#else
- .BufferStartingAddress = addr,
- .EndAddress = anv_address_add(addr, size),
-#endif
- });
+ /* If secondary buffer supports conditional rendering
+ * we should emit commands as if conditional rendering is enabled.
+ */
+ cmd_buffer->state.conditional_render_enabled =
+ conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
- genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
- index, addr, size);
-}
+ if (pBeginInfo->pInheritanceInfo->occlusionQueryEnable) {
+ cmd_buffer->state.gfx.n_occlusion_queries = 1;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
+ }
+ }
-static void
-emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address addr)
-{
- emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
+ return VK_SUCCESS;
}
+/* From the PRM, Volume 2a:
+ *
+ * "Indirect State Pointers Disable
+ *
+ * At the completion of the post-sync operation associated with this pipe
+ * control packet, the indirect state pointers in the hardware are
+ * considered invalid; the indirect pointers are not saved in the context.
+ * If any new indirect state commands are executed in the command stream
+ * while the pipe control is pending, the new indirect state commands are
+ * preserved.
+ *
+ * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
+ * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
+ * commands are only considered as Indirect State Pointers. Once ISP is
+ * issued in a context, SW must initialize by programming push constant
+ * commands for all the shaders (at least to zero length) before attempting
+ * any rendering operation for the same context."
+ *
+ * 3DSTATE_CONSTANT_* packets are restored during a context restore,
+ * even though they point to a BO that has been already unreferenced at
+ * the end of the previous batch buffer. This has been fine so far since
+ * we are protected by the scratch page (every address not covered by
+ * a BO should be pointing to the scratch page). But on CNL, it is
+ * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
+ * instruction.
+ *
+ * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
+ * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
+ * context restore, so the mentioned hang doesn't happen. However,
+ * software must program push constant commands for all stages prior to
+ * rendering anything. So we flag them dirty in BeginCommandBuffer.
+ *
+ * Finally, we also make sure to stall at pixel scoreboard to make sure the
+ * constants have been loaded into the EUs prior to disabling the push constants
+ * so that it doesn't hang a previous 3DPRIMITIVE.
+ */
static void
-emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
- uint32_t base_vertex, uint32_t base_instance)
+emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
{
- if (base_vertex == 0 && base_instance == 0) {
- emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
- } else {
- struct anv_state id_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
-
- ((uint32_t *)id_state.map)[0] = base_vertex;
- ((uint32_t *)id_state.map)[1] = base_instance;
-
- struct anv_address addr = {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = id_state.offset,
- };
-
- emit_base_vertex_instance_bo(cmd_buffer, addr);
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.IndirectStatePointersDisable = true;
+ pc.CommandStreamerStallEnable = true;
+ anv_debug_dump_pc(pc, __func__);
}
}
-static void
-emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
+static VkResult
+end_command_buffer(struct anv_cmd_buffer *cmd_buffer)
{
- struct anv_state state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return cmd_buffer->batch.status;
- ((uint32_t *)state.map)[0] = draw_index;
+ anv_measure_endcommandbuffer(cmd_buffer);
- struct anv_address addr = {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = state.offset,
- };
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+ trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ anv_cmd_buffer_end_batch_buffer(cmd_buffer);
+ return VK_SUCCESS;
+ }
- emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
-}
+ /* Flush query clears using blorp so that secondary query writes do not
+ * race with the clear.
+ */
+ if (cmd_buffer->state.queries.clear_bits) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
+ "query clear flush prior command buffer end");
+ }
-static void
-update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
- uint32_t access_type)
-{
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- uint64_t vb_used = pipeline->vb_used;
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- vb_used |= 1ull << ANV_SVGS_VB_INDEX;
- if (vs_prog_data->uses_drawid)
- vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
-
- genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
- access_type == RANDOM,
- vb_used);
-}
+ genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
-ALWAYS_INLINE static void
-cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
- const struct brw_vs_prog_data *vs_prog_data,
- uint32_t base_vertex,
- uint32_t base_instance,
- uint32_t draw_id,
- bool force_flush)
-{
- bool emitted = false;
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance) {
- emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
- emitted = true;
- }
- if (vs_prog_data->uses_drawid) {
- emit_draw_index(cmd_buffer, draw_id);
- emitted = true;
- }
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* Turn on object level preemption if it is disabled to have it in known
+ * state at the beginning of new command buffer.
*/
- if (emitted || force_flush)
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-}
-
-void genX(CmdDraw)(
- VkCommandBuffer commandBuffer,
- uint32_t vertexCount,
- uint32_t instanceCount,
- uint32_t firstVertex,
- uint32_t firstInstance)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ if (!cmd_buffer->state.gfx.object_preemption)
+ genX(cmd_buffer_set_preemption)(cmd_buffer, true);
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ /* We want every command buffer to start with the PMA fix in a known state,
+ * so we disable it at the end of the command buffer.
+ */
+ genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
- const uint32_t count = (vertexCount *
- instanceCount *
- (pipeline->use_primitive_replication ?
- 1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw", count);
+ /* Wa_14015814527
+ *
+ * Apply task URB workaround at the end of a primary or secondary cmd_buffer.
+ */
+ genX(apply_task_urb_workaround)(cmd_buffer);
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ emit_isp_disable(cmd_buffer);
- cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
- firstVertex, firstInstance, 0,
- true);
+#if GFX_VER >= 12
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
+ cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ genX(cmd_buffer_set_protected_memory)(cmd_buffer, false);
+#endif
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
- */
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
+ trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = vertexCount;
- prim.StartVertexLocation = firstVertex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = 0;
- }
+ anv_cmd_buffer_end_batch_buffer(cmd_buffer);
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+ return VK_SUCCESS;
}
-void genX(CmdDrawMultiEXT)(
- VkCommandBuffer commandBuffer,
- uint32_t drawCount,
- const VkMultiDrawInfoEXT *pVertexInfo,
- uint32_t instanceCount,
- uint32_t firstInstance,
- uint32_t stride)
+VkResult
+genX(EndCommandBuffer)(
+ VkCommandBuffer commandBuffer)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- const uint32_t count = (drawCount *
- instanceCount *
- (pipeline->use_primitive_replication ?
- 1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw_multi", count);
-
- genX(cmd_buffer_flush_state)(cmd_buffer);
-
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ VkResult status = end_command_buffer(cmd_buffer);
+ if (status != VK_SUCCESS)
+ return status;
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
+ /* If there was MSAA access over the compute/transfer queue, a companion
+ * RCS command buffer was recorded, so end it properly as well.
*/
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
-
- uint32_t i = 0;
- vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
- cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
- draw->firstVertex,
- firstInstance, i, !i);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = draw->vertexCount;
- prim.StartVertexLocation = draw->firstVertex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = 0;
- }
+ if (cmd_buffer->companion_rcs_cmd_buffer) {
+ assert(anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_blitter_queue(cmd_buffer));
+ status = end_command_buffer(cmd_buffer->companion_rcs_cmd_buffer);
}
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+ ANV_RMV(cmd_buffer_create, cmd_buffer->device, cmd_buffer);
+
+ return status;
}
-void genX(CmdDrawIndexed)(
- VkCommandBuffer commandBuffer,
- uint32_t indexCount,
- uint32_t instanceCount,
- uint32_t firstIndex,
- int32_t vertexOffset,
- uint32_t firstInstance)
+static void
+cmd_buffer_emit_copy_ts_buffer(struct u_trace_context *utctx,
+ void *cmdstream,
+ void *ts_from, uint32_t from_offset,
+ void *ts_to, uint32_t to_offset,
+ uint32_t count)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- const uint32_t count = (indexCount *
- instanceCount *
- (pipeline->use_primitive_replication ?
- 1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw indexed",
- count);
-
- genX(cmd_buffer_flush_state)(cmd_buffer);
-
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
-
- cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data, vertexOffset, firstInstance, 0, true);
-
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
- */
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = indexCount;
- prim.StartVertexLocation = firstIndex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = vertexOffset;
- }
-
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
+ struct anv_memcpy_state *memcpy_state = cmdstream;
+ struct anv_address from_addr = (struct anv_address) {
+ .bo = ts_from, .offset = from_offset * sizeof(uint64_t) };
+ struct anv_address to_addr = (struct anv_address) {
+ .bo = ts_to, .offset = to_offset * sizeof(uint64_t) };
+
+ genX(emit_so_memcpy)(memcpy_state, to_addr, from_addr,
+ count * sizeof(uint64_t));
}
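
The u_trace copy callback above works in 64-bit timestamp slots and converts indices to byte offsets before the GPU memcpy. A CPU-side sketch of the same index-to-byte conversion, with made-up data:

/* CPU-side stand-in for the GPU copy: same index-to-byte conversion. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void copy_timestamps(uint64_t *to, uint32_t to_idx,
                            const uint64_t *from, uint32_t from_idx,
                            uint32_t count)
{
   memcpy((char *)to + to_idx * sizeof(uint64_t),
          (const char *)from + from_idx * sizeof(uint64_t),
          count * sizeof(uint64_t));
}

int main(void)
{
   uint64_t src[4] = { 10, 20, 30, 40 };
   uint64_t dst[4] = { 0, 0, 0, 0 };

   copy_timestamps(dst, 1, src, 2, 2);   /* dst[1..2] = src[2..3] */
   assert(dst[1] == 30 && dst[2] == 40);
   return 0;
}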
-void genX(CmdDrawMultiIndexedEXT)(
+void
+genX(CmdExecuteCommands)(
VkCommandBuffer commandBuffer,
- uint32_t drawCount,
- const VkMultiDrawIndexedInfoEXT *pIndexInfo,
- uint32_t instanceCount,
- uint32_t firstInstance,
- uint32_t stride,
- const int32_t *pVertexOffset)
+ uint32_t commandBufferCount,
+ const VkCommandBuffer* pCmdBuffers)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ ANV_FROM_HANDLE(anv_cmd_buffer, container, commandBuffer);
- if (anv_batch_has_error(&cmd_buffer->batch))
+ struct anv_device *device = container->device;
+
+ if (anv_batch_has_error(&container->batch))
return;
- const uint32_t count = (drawCount *
- instanceCount *
- (pipeline->use_primitive_replication ?
- 1 : anv_subpass_view_count(cmd_buffer->state.subpass)));
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw indexed_multi",
- count);
+ /* The secondary command buffers will assume that the PMA fix is disabled
+ * when they begin executing. Make sure this is true.
+ */
+ genX(cmd_buffer_enable_pma_fix)(container, false);
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ /* Turn on preemption in case it was toggled off. */
+ if (!container->state.gfx.object_preemption)
+ genX(cmd_buffer_set_preemption)(container, true);
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ /* Wa_14015814527
+ *
+ * Apply task URB workaround before secondary cmd buffers.
+ */
+ genX(apply_task_urb_workaround)(container);
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
+ /* Flush query clears using blorp so that secondary query writes do not
+ * race with the clear.
*/
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
-
- uint32_t i = 0;
- if (pVertexOffset) {
- if (vs_prog_data->uses_drawid) {
- bool emitted = true;
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance) {
- emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
- emitted = true;
- }
- vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
- if (vs_prog_data->uses_drawid) {
- emit_draw_index(cmd_buffer, i);
- emitted = true;
- }
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
- */
- if (emitted)
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = draw->indexCount;
- prim.StartVertexLocation = draw->firstIndex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = *pVertexOffset;
- }
- emitted = false;
- }
- } else {
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance) {
- emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
- */
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- }
- vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = draw->indexCount;
- prim.StartVertexLocation = draw->firstIndex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = *pVertexOffset;
- }
- }
- }
- } else {
- vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
- cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
- draw->vertexOffset,
- firstInstance, i, i != 0);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- prim.VertexCountPerInstance = draw->indexCount;
- prim.StartVertexLocation = draw->firstIndex;
- prim.InstanceCount = instanceCount;
- prim.StartInstanceLocation = firstInstance;
- prim.BaseVertexLocation = draw->vertexOffset;
- }
- }
+ if (container->state.queries.clear_bits) {
+ anv_add_pending_pipe_bits(container,
+ ANV_PIPE_QUERY_BITS(container->state.queries.clear_bits),
+ "query clear flush prior to secondary buffer");
}
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
-}
-
-/* Auto-Draw / Indirect Registers */
-#define GFX7_3DPRIM_END_OFFSET 0x2420
-#define GFX7_3DPRIM_START_VERTEX 0x2430
-#define GFX7_3DPRIM_VERTEX_COUNT 0x2434
-#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
-#define GFX7_3DPRIM_START_INSTANCE 0x243C
-#define GFX7_3DPRIM_BASE_VERTEX 0x2440
-
-void genX(CmdDrawIndirectByteCountEXT)(
- VkCommandBuffer commandBuffer,
- uint32_t instanceCount,
- uint32_t firstInstance,
- VkBuffer counterBuffer,
- VkDeviceSize counterBufferOffset,
- uint32_t counterOffset,
- uint32_t vertexStride)
-{
-#if GFX_VERx10 >= 75
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- /* firstVertex is always zero for this draw function */
- const uint32_t firstVertex = 0;
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_DRAW,
- "draw indirect byte count",
- instanceCount);
+ /* The secondary command buffers don't know which textures etc. have been
+ * flushed prior to their execution. Apply those flushes now.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(container);
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ genX(cmd_buffer_flush_generated_draws)(container);
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, 0);
+ UNUSED enum anv_cmd_descriptor_buffer_mode db_mode =
+ container->state.current_db_mode;
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* Do a first pass to copy the surface state content of the render targets
+ * if needed.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ bool need_surface_state_copy = false;
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. We need to multiply instanceCount by the view count.
- */
- if (!pipeline->use_primitive_replication)
- instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
+ if (secondary->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+ need_surface_state_copy = true;
+ break;
+ }
+ }
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value count =
- mi_mem32(anv_address_add(counter_buffer->address,
- counterBufferOffset));
- if (counterOffset)
- count = mi_isub(&b, count, mi_imm(counterOffset));
- count = mi_udiv32_imm(&b, count, vertexStride);
- mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
-
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), mi_imm(instanceCount));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- }
-
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
-#endif /* GFX_VERx10 >= 75 */
-}
+ if (need_surface_state_copy) {
+ if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ genX(cmd_buffer_set_protected_memory)(container, false);
-static void
-load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address addr,
- bool indexed)
-{
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ /* The memcpy will take care of the 3D preemption requirements. */
+ struct anv_memcpy_state memcpy_state;
+ genX(emit_so_memcpy_init)(&memcpy_state, device, &container->batch);
- mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
- mi_mem32(anv_address_add(addr, 0)));
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
- struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
- unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
- if (view_count > 1) {
-#if GFX_VERx10 >= 75
- instance_count = mi_imul_imm(&b, instance_count, view_count);
-#else
- anv_finishme("Multiview + indirect draw requires MI_MATH; "
- "MI_MATH is not supported on Ivy Bridge");
-#endif
- }
- mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
+ assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(!anv_batch_has_error(&secondary->batch));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
- mi_mem32(anv_address_add(addr, 8)));
+ if (secondary->usage_flags &
+ VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
+ /* If we're continuing a render pass from the container, we need
+ * to copy the surface states for the current subpass into the
+ * storage we allocated for them in BeginCommandBuffer.
+ */
+ struct anv_state src_state = container->state.gfx.att_states;
+ struct anv_state dst_state = secondary->state.gfx.att_states;
+ assert(src_state.alloc_size == dst_state.alloc_size);
+
+ genX(emit_so_memcpy)(
+ &memcpy_state,
+ anv_state_pool_state_address(&device->internal_surface_state_pool,
+ dst_state),
+ anv_state_pool_state_address(&device->internal_surface_state_pool,
+ src_state),
+ src_state.alloc_size);
+ }
+ }
+ genX(emit_so_memcpy_fini)(&memcpy_state);
- if (indexed) {
- mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
- mi_mem32(anv_address_add(addr, 12)));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
- mi_mem32(anv_address_add(addr, 16)));
- } else {
- mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
- mi_mem32(anv_address_add(addr, 12)));
- mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
+ if (container->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ genX(cmd_buffer_set_protected_memory)(container, true);
}
-}
-
-void genX(CmdDrawIndirect)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- uint32_t drawCount,
- uint32_t stride)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ /* Ensure preemption is enabled (assumed for all secondaries) */
+ genX(cmd_buffer_set_preemption)(container, true);
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
+ assert(!anv_batch_has_error(&secondary->batch));
- for (uint32_t i = 0; i < drawCount; i++) {
- struct anv_address draw = anv_address_add(buffer->address, offset);
+ if (secondary->state.conditional_render_enabled) {
+ if (!container->state.conditional_render_enabled) {
+ /* The secondary buffer is constructed as if it will be executed
+ * with conditional rendering, so we should satisfy this dependency
+ * regardless of whether conditional rendering is enabled in the
+ * container.
+ */
+ struct mi_builder b;
+ mi_builder_init(&b, device->info, &container->batch);
+ mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
+ mi_imm(UINT64_MAX));
+ }
+ }
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, i);
+ anv_cmd_buffer_add_secondary(container, secondary);
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* If the secondary has a valid companion RCS command buffer, add it to
+ * the container's RCS command buffer for execution.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- load_indirect_parameters(cmd_buffer, draw, false);
+ if (secondary->companion_rcs_cmd_buffer != NULL) {
+ VkResult result = anv_cmd_buffer_ensure_rcs_companion(container);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&container->batch, result);
+ return;
+ }
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
+ anv_cmd_buffer_add_secondary(container->companion_rcs_cmd_buffer,
+ secondary->companion_rcs_cmd_buffer);
}
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+ assert(secondary->perf_query_pool == NULL || container->perf_query_pool == NULL ||
+ secondary->perf_query_pool == container->perf_query_pool);
+ if (secondary->perf_query_pool)
+ container->perf_query_pool = secondary->perf_query_pool;
- offset += stride;
- }
-}
+#if INTEL_NEEDS_WA_1808121037
+ if (secondary->state.depth_reg_mode != ANV_DEPTH_REG_MODE_UNKNOWN)
+ container->state.depth_reg_mode = secondary->state.depth_reg_mode;
+#endif
-void genX(CmdDrawIndexedIndirect)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- uint32_t drawCount,
- uint32_t stride)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ container->state.gfx.viewport_set |= secondary->state.gfx.viewport_set;
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ db_mode = secondary->state.current_db_mode;
+ }
+
+ /* The secondary isn't counted in our VF cache tracking so we need to
+ * invalidate the whole thing.
+ */
+ if (GFX_VER == 9) {
+ anv_add_pending_pipe_bits(container,
+ ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
+ "Secondary cmd buffer not tracked in VF cache");
+ }
- genX(cmd_buffer_flush_state)(cmd_buffer);
+#if INTEL_WA_16014538804_GFX_VER
+ if (anv_cmd_buffer_is_render_queue(container) &&
+ intel_needs_workaround(device->info, 16014538804))
+ anv_batch_emit(&container->batch, GENX(PIPE_CONTROL), pc);
+#endif
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+ /* The secondary may have selected a different pipeline (3D or compute) and
+ * may have changed the current L3$ configuration. Reset our tracking
+ * variables to invalid values to ensure that we re-emit these in the case
+ * where we do any draws or compute dispatches from the container after the
+ * secondary has returned.
+ */
+ container->state.current_pipeline = UINT32_MAX;
+ container->state.current_l3_config = NULL;
+ container->state.current_hash_scale = 0;
+ container->state.gfx.push_constant_stages = 0;
+ container->state.gfx.ds_write_state = false;
- for (uint32_t i = 0; i < drawCount; i++) {
- struct anv_address draw = anv_address_add(buffer->address, offset);
+ memset(&container->state.gfx.urb_cfg, 0, sizeof(struct intel_urb_config));
- /* TODO: We need to stomp base vertex to 0 somehow */
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, i);
+ /* Reemit all GFX instructions in container */
+ memcpy(container->state.gfx.dyn_state.dirty,
+ device->gfx_dirty_state,
+ sizeof(container->state.gfx.dyn_state.dirty));
+ if (container->device->vk.enabled_extensions.KHR_fragment_shading_rate) {
+ /* Also recompute the CPS_STATE offset */
+ struct vk_dynamic_graphics_state *dyn =
+ &container->vk.dynamic_graphics_state;
+ BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_FSR);
+ }
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* Each of the secondary command buffers will use its own state base
+ * address. We need to re-emit state base address for the container after
+ * all of the secondaries are done.
+ */
+ if (container->device->vk.enabled_extensions.EXT_descriptor_buffer) {
+#if GFX_VERx10 >= 125
+ /* If the last secondary had a different mode, reemit the last pending
+ * mode. Otherwise, we can do a lighter binding table pool update.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ if (db_mode != container->state.current_db_mode) {
+ container->state.current_db_mode = db_mode;
+ genX(cmd_buffer_emit_state_base_address)(container);
+ } else {
+ genX(cmd_buffer_emit_bt_pool_base_address)(container);
+ }
+#else
+ genX(cmd_buffer_emit_state_base_address)(container);
+#endif
+ } else {
+ genX(cmd_buffer_emit_bt_pool_base_address)(container);
+ }
- load_indirect_parameters(cmd_buffer, draw, true);
+ /* Copy utrace timestamp buffers from the secondaries into the container */
+ if (u_trace_enabled(&device->ds.trace_context)) {
+ trace_intel_begin_trace_copy(&container->trace);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
+ struct anv_memcpy_state memcpy_state;
+ genX(emit_so_memcpy_init)(&memcpy_state, device, &container->batch);
+ uint32_t num_traces = 0;
+ for (uint32_t i = 0; i < commandBufferCount; i++) {
+ ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
+
+ num_traces += secondary->trace.num_traces;
+ u_trace_clone_append(u_trace_begin_iterator(&secondary->trace),
+ u_trace_end_iterator(&secondary->trace),
+ &container->trace,
+ &memcpy_state,
+ cmd_buffer_emit_copy_ts_buffer);
}
+ genX(emit_so_memcpy_fini)(&memcpy_state);
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
+ trace_intel_end_trace_copy(&container->trace, num_traces);
- offset += stride;
+ /* Memcpy is done using the 3D pipeline. */
+ container->state.current_pipeline = _3D;
}
}
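
For reference, a minimal application-side sketch of the recording pattern the code above handles (illustrative only; primary, secondary, render_pass and framebuffer are assumed, pre-existing handles). Recording the secondary with RENDER_PASS_CONTINUE_BIT is what triggers the surface-state copy pass, and executing any secondary resets the container's pipeline/L3/state-base-address tracking:

   VkCommandBufferInheritanceInfo inherit = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
      .renderPass = render_pass,   /* assumed handle */
      .subpass = 0,
      .framebuffer = framebuffer,  /* assumed handle */
   };
   VkCommandBufferBeginInfo sec_begin = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
      .flags = VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT,
      .pInheritanceInfo = &inherit,
   };
   vkBeginCommandBuffer(secondary, &sec_begin);
   /* ... record draws ... */
   vkEndCommandBuffer(secondary);

   /* In the primary ("container"), inside a render pass begun with
    * VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS:
    */
   vkCmdExecuteCommands(primary, 1, &secondary);
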
-static struct mi_value
-prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
- struct mi_builder *b,
- struct anv_buffer *count_buffer,
- uint64_t countBufferOffset)
+static inline enum anv_pipe_bits
+anv_pipe_flush_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
+ VkAccessFlags2 flags)
{
- struct anv_address count_address =
- anv_address_add(count_buffer->address, countBufferOffset);
-
- struct mi_value ret = mi_imm(0);
-
- if (cmd_buffer->state.conditional_render_enabled) {
-#if GFX_VERx10 >= 75
- ret = mi_new_gpr(b);
- mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
-#endif
- } else {
- /* Upload the current draw count from the draw parameters buffer to
- * MI_PREDICATE_SRC0.
- */
- mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
- mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
+ enum anv_pipe_bits pipe_bits = 0;
+
+ u_foreach_bit64(b, flags) {
+ switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
+ case VK_ACCESS_2_SHADER_WRITE_BIT:
+ case VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT:
+ case VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
+ /* We're transitioning a buffer that was previously used as write
+ * destination through the data port. To make its content available
+ * to future operations, flush the hdc pipeline.
+ */
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT:
+ /* We're transitioning a buffer that was previously used as render
+ * target. To make its content available to future operations, flush
+ * the render target cache.
+ */
+ pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
+ /* We're transitioning a buffer that was previously used as depth
+ * buffer. To make its content available to future operations, flush
+ * the depth cache.
+ */
+ pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_TRANSFER_WRITE_BIT:
+ /* We're transitioning a buffer that was previously used as a
+ * transfer write destination. Generic write operations include color
+ * & depth operations as well as buffer operations like:
+ * - vkCmdClearColorImage()
+ * - vkCmdClearDepthStencilImage()
+ * - vkCmdBlitImage()
+ * - vkCmdCopy*(), vkCmdUpdate*(), vkCmdFill*()
+ *
+ * Most of these operations are implemented using Blorp which writes
+ * through the render target cache or the depth cache on the graphics
+ * queue. On the compute queue, the writes are done through the data
+ * port.
+ */
+ if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ } else {
+ /* We can use the data port when trying to stay in compute mode on
+ * the RCS.
+ */
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ /* Most operations are done through RT/depth writes */
+ pipe_bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+ }
+ break;
+ case VK_ACCESS_2_MEMORY_WRITE_BIT:
+ /* We're transitioning a buffer for generic write operations. Flush
+ * all the caches.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ break;
+ case VK_ACCESS_2_HOST_WRITE_BIT:
+ /* We're transitioning a buffer for access by the CPU. Invalidate
+ * all the caches. Since the data and tile caches don't have an
+ * invalidate operation, we are forced to flush those as well.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
+ break;
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
+ /* We're transitioning a buffer written either from the VS stage or
+ * from the command streamer (see CmdEndTransformFeedbackEXT), so we
+ * just need to stall the CS.
+ *
+ * Streamout writes apparently bypass L3, so in order to make them
+ * visible to the destination, we need to invalidate the other
+ * caches.
+ */
+ pipe_bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_INVALIDATE_BITS;
+ break;
+ default:
+ break; /* Nothing to do */
+ }
}
- return ret;
+ return pipe_bits;
}
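
A worked example of the mapping above (a sketch, assuming a graphics-queue command buffer so the non-compute branch of the TRANSFER_WRITE case applies):

   VkAccessFlags2 src = VK_ACCESS_2_TRANSFER_WRITE_BIT |
                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT;
   enum anv_pipe_bits bits =
      anv_pipe_flush_bits_for_access_flags(cmd_buffer, src);
   /* bits == ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
    *         ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
    *         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
    *         ANV_PIPE_DEPTH_CACHE_FLUSH_BIT
    */
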
-static void
-emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
- struct mi_builder *b,
- uint32_t draw_index)
+static inline enum anv_pipe_bits
+anv_pipe_invalidate_bits_for_access_flags(struct anv_cmd_buffer *cmd_buffer,
+ VkAccessFlags2 flags)
{
- /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
- mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
-
- if (draw_index == 0) {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_SET;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
- } else {
- /* While draw_index < draw_count the predicate's result will be
- * (draw_index == draw_count) ^ TRUE = TRUE
- * When draw_index == draw_count the result is
- * (TRUE) ^ TRUE = FALSE
- * After this all results will be:
- * (FALSE) ^ FALSE = FALSE
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOAD;
- mip.CombineOperation = COMBINE_XOR;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ struct anv_device *device = cmd_buffer->device;
+ enum anv_pipe_bits pipe_bits = 0;
+
+ u_foreach_bit64(b, flags) {
+ switch ((VkAccessFlags2)BITFIELD64_BIT(b)) {
+ case VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT:
+ /* Indirect draw commands take a buffer as input that we're going to
+ * read from the command streamer to load some of the HW registers
+ * (see genX_cmd_buffer.c:load_indirect_parameters). This requires a
+ * command streamer stall so that all the cache flushes have
+ * completed before the command streamer loads from memory.
+ */
+ pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+ /* Indirect draw commands also set gl_BaseVertex & gl_BaseIndex
+ * through a vertex buffer, so invalidate that cache.
+ */
+ pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ /* For CmdDispatchIndirect, we also load gl_NumWorkGroups through a
+ * UBO from the buffer, so we need to invalidate constant cache.
+ */
+ pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
+ pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ /* A tile cache flush is needed for CmdDispatchIndirect since the
+ * command streamer and vertex fetch aren't L3 coherent.
+ */
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_INDEX_READ_BIT:
+ case VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT:
+ /* We're transitioning a buffer to be used as input for vkCmdDraw*
+ * commands, so we invalidate the VF cache to make sure there is no
+ * stale data when we start rendering.
+ */
+ pipe_bits |= ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
+ break;
+ case VK_ACCESS_2_UNIFORM_READ_BIT:
+ case VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR:
+ /* We're transitioning a buffer to be used as uniform data. Because
+ * uniforms are accessed through the data port & sampler, we need to
+ * invalidate the texture cache (sampler) & constant cache (data
+ * port) to avoid stale data.
+ */
+ pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
+ if (device->physical->compiler->indirect_ubos_use_sampler) {
+ pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+ } else {
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+ break;
+ case VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT:
+ case VK_ACCESS_2_TRANSFER_READ_BIT:
+ case VK_ACCESS_2_SHADER_SAMPLED_READ_BIT:
+ /* Transitioning a buffer to be read through the sampler, so
+ * invalidate the texture cache; we don't want any stale data.
+ */
+ pipe_bits |= ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+ break;
+ case VK_ACCESS_2_SHADER_READ_BIT:
+ /* Same as VK_ACCESS_2_UNIFORM_READ_BIT and
+ * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT cases above
+ */
+ pipe_bits |= ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
+ if (!device->physical->compiler->indirect_ubos_use_sampler) {
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+ break;
+ case VK_ACCESS_2_MEMORY_READ_BIT:
+ /* Transitioning a buffer for generic read, invalidate all the
+ * caches.
+ */
+ pipe_bits |= ANV_PIPE_INVALIDATE_BITS;
+ break;
+ case VK_ACCESS_2_MEMORY_WRITE_BIT:
+ /* Generic write, make sure all previously written things land in
+ * memory.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ break;
+ case VK_ACCESS_2_CONDITIONAL_RENDERING_READ_BIT_EXT:
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT:
+ /* Transitioning a buffer for conditional rendering or transform
+ * feedback. We'll load the content of this buffer into HW registers
+ * using the command streamer, so we need to stall the command
+ * streamer to make sure any in-flight flush operations have
+ * completed.
+ */
+ pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_HOST_READ_BIT:
+ /* We're transitioning a buffer that was written by the CPU. Flush
+ * all the caches.
+ */
+ pipe_bits |= ANV_PIPE_FLUSH_BITS;
+ break;
+ case VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
+ /* We're transitioning a buffer to be written by the streamout fixed
+ * function. This one is apparently not L3 coherent, so we need a
+ * tile cache flush to make sure any previous write is not going to
+ * create WaW hazards.
+ */
+ pipe_bits |= ANV_PIPE_DATA_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+ break;
+ case VK_ACCESS_2_SHADER_STORAGE_READ_BIT:
+ /* VK_ACCESS_2_SHADER_STORAGE_READ_BIT specifies read access to a
+ * storage buffer, physical storage buffer, storage texel buffer, or
+ * storage image in any shader pipeline stage.
+ *
+ * Any storage buffers or images written to must be invalidated and
+ * flushed before the shader can access them.
+ *
+ * Both HDC & Untyped flushes also do invalidation, which is why we
+ * use them here on Gfx12+.
+ *
+ * Gfx11 and prior don't have HDC. Only Data cache flush is available
+ * and it only operates on the written cache lines.
+ */
+ if (device->info->ver >= 12) {
+ pipe_bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ pipe_bits |= ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+ }
+ break;
+ case VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT:
+ pipe_bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
+ break;
+ default:
+ break; /* Nothing to do */
}
}
+
+ return pipe_bits;
}
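
And the corresponding worked example for the invalidate side (again a sketch; for this particular flag the result does not depend on the device):

   enum anv_pipe_bits bits =
      anv_pipe_invalidate_bits_for_access_flags(
         cmd_buffer, VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT);
   /* bits == ANV_PIPE_CS_STALL_BIT |
    *         ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
    *         ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
    *         ANV_PIPE_DATA_CACHE_FLUSH_BIT |
    *         ANV_PIPE_TILE_CACHE_FLUSH_BIT
    */
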
-#if GFX_VERx10 >= 75
-static void
-emit_draw_count_predicate_with_conditional_render(
- struct anv_cmd_buffer *cmd_buffer,
- struct mi_builder *b,
- uint32_t draw_index,
- struct mi_value max)
+static inline bool
+stage_is_shader(const VkPipelineStageFlags2 stage)
{
- struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
- pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
-
-#if GFX_VER >= 8
- mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
-#else
- /* MI_PREDICATE_RESULT is not whitelisted in i915 command parser
- * so we emit MI_PREDICATE to set it.
- */
-
- mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
- mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
+ return (stage & (VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
+ VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
+ VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+ VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
+ VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT |
+ VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT));
+}
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_SET;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
-#endif
+static inline bool
+stage_is_transfer(const VkPipelineStageFlags2 stage)
+{
+ return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+ VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT));
}
-#endif
-static void
-emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
- struct mi_builder *b,
- uint32_t draw_index,
- struct mi_value max)
+static inline bool
+stage_is_video(const VkPipelineStageFlags2 stage)
{
-#if GFX_VERx10 >= 75
- if (cmd_buffer->state.conditional_render_enabled) {
- emit_draw_count_predicate_with_conditional_render(
- cmd_buffer, b, draw_index, mi_value_ref(b, max));
- } else {
- emit_draw_count_predicate(cmd_buffer, b, draw_index);
- }
-#else
- emit_draw_count_predicate(cmd_buffer, b, draw_index);
+ return (stage & (VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
+#ifdef VK_ENABLE_BETA_EXTENSIONS
+ VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR |
#endif
+ VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR));
}
-void genX(CmdDrawIndirectCount)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkBuffer _countBuffer,
- VkDeviceSize countBufferOffset,
- uint32_t maxDrawCount,
- uint32_t stride)
+static inline bool
+mask_is_shader_write(const VkAccessFlags2 access)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
- struct anv_cmd_state *cmd_state = &cmd_buffer->state;
- struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- genX(cmd_buffer_flush_state)(cmd_buffer);
-
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value max =
- prepare_for_draw_count_predicate(cmd_buffer, &b,
- count_buffer, countBufferOffset);
-
- for (uint32_t i = 0; i < maxDrawCount; i++) {
- struct anv_address draw = anv_address_add(buffer->address, offset);
-
- emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
-
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, i);
-
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
- */
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- load_indirect_parameters(cmd_buffer, draw, false);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.PredicateEnable = true;
- prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
- }
-
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
-
- offset += stride;
- }
-
- mi_value_unref(&b, max);
+ return (access & (VK_ACCESS_2_SHADER_WRITE_BIT |
+ VK_ACCESS_2_MEMORY_WRITE_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT));
}
-void genX(CmdDrawIndexedIndirectCount)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkBuffer _countBuffer,
- VkDeviceSize countBufferOffset,
- uint32_t maxDrawCount,
- uint32_t stride)
+static inline bool
+mask_is_write(const VkAccessFlags2 access)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
- struct anv_cmd_state *cmd_state = &cmd_buffer->state;
- struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
- const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- genX(cmd_buffer_flush_state)(cmd_buffer);
+ return access & (VK_ACCESS_2_SHADER_WRITE_BIT |
+ VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
+ VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
+ VK_ACCESS_2_TRANSFER_WRITE_BIT |
+ VK_ACCESS_2_HOST_WRITE_BIT |
+ VK_ACCESS_2_MEMORY_WRITE_BIT |
+ VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
+ VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR |
+#ifdef VK_ENABLE_BETA_EXTENSIONS
+ VK_ACCESS_2_VIDEO_ENCODE_WRITE_BIT_KHR |
+#endif
+ VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT |
+ VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
+ VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV |
+ VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR |
+ VK_ACCESS_2_MICROMAP_WRITE_BIT_EXT |
+ VK_ACCESS_2_OPTICAL_FLOW_WRITE_BIT_NV);
+}
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value max =
- prepare_for_draw_count_predicate(cmd_buffer, &b,
- count_buffer, countBufferOffset);
+static inline bool
+mask_is_transfer_write(const VkAccessFlags2 access)
+{
+ return access & (VK_ACCESS_2_TRANSFER_WRITE_BIT |
+ VK_ACCESS_2_MEMORY_WRITE_BIT);
+}
- for (uint32_t i = 0; i < maxDrawCount; i++) {
- struct anv_address draw = anv_address_add(buffer->address, offset);
+static void
+cmd_buffer_barrier_video(struct anv_cmd_buffer *cmd_buffer,
+ const VkDependencyInfo *dep_info)
+{
+ assert(anv_cmd_buffer_is_video_queue(cmd_buffer));
- emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
+ bool flush_llc = false;
+ bool flush_ccs = false;
+ for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
+ const VkImageMemoryBarrier2 *img_barrier =
+ &dep_info->pImageMemoryBarriers[i];
- /* TODO: We need to stomp base vertex to 0 somehow */
- if (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance)
- emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
- if (vs_prog_data->uses_drawid)
- emit_draw_index(cmd_buffer, i);
+ ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
+ const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
- /* Emitting draw index or vertex index BOs may result in needing
- * additional VF cache flushes.
+ /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
+ * memory barrier defines a queue family ownership transfer.
*/
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
+ flush_llc = true;
- load_indirect_parameters(cmd_buffer, draw, true);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
- prim.IndirectParameterEnable = true;
- prim.PredicateEnable = true;
- prim.VertexAccessType = RANDOM;
- prim.PrimitiveTopologyType = cmd_buffer->state.gfx.primitive_topology;
+ VkImageAspectFlags img_aspects =
+ vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+ anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
+ const uint32_t plane =
+ anv_image_aspect_to_plane(image, 1UL << aspect_bit);
+ if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
+ flush_ccs = true;
+ }
}
-
- update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
-
- offset += stride;
}
- mi_value_unref(&b, max);
-}
-
-void genX(CmdBeginTransformFeedbackEXT)(
- VkCommandBuffer commandBuffer,
- uint32_t firstCounterBuffer,
- uint32_t counterBufferCount,
- const VkBuffer* pCounterBuffers,
- const VkDeviceSize* pCounterBufferOffsets)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- assert(firstCounterBuffer < MAX_XFB_BUFFERS);
- assert(counterBufferCount <= MAX_XFB_BUFFERS);
- assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
-
- /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
- *
- * "Ssoftware must ensure that no HW stream output operations can be in
- * process or otherwise pending at the point that the MI_LOAD/STORE
- * commands are processed. This will likely require a pipeline flush."
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "begin transform feedback");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
- /* If we have a counter buffer, this is a resume so we need to load the
- * value into the streamout offset register. Otherwise, this is a begin
- * and we need to reset it to zero.
+ for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
+ /* Flush the cache if something is written by the video operations and
+ * used by any stage other than the video encode/decode stages, or if
+ * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
+ * case this memory barrier defines a queue family ownership transfer).
*/
- if (pCounterBuffers &&
- idx >= firstCounterBuffer &&
- idx - firstCounterBuffer < counterBufferCount &&
- pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
- uint32_t cb_idx = idx - firstCounterBuffer;
- ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
- uint64_t offset = pCounterBufferOffsets ?
- pCounterBufferOffsets[cb_idx] : 0;
-
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
- lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
- lrm.MemoryAddress = anv_address_add(counter_buffer->address,
- offset);
- }
- } else {
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
- lri.DataDWord = 0;
- }
+ if ((stage_is_video(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
+ mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask) &&
+ !stage_is_video(dep_info->pBufferMemoryBarriers[i].dstStageMask)) ||
+ (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
+ dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
+ flush_llc = true;
+ break;
}
}
- cmd_buffer->state.xfb_enabled = true;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
-}
-
-void genX(CmdEndTransformFeedbackEXT)(
- VkCommandBuffer commandBuffer,
- uint32_t firstCounterBuffer,
- uint32_t counterBufferCount,
- const VkBuffer* pCounterBuffers,
- const VkDeviceSize* pCounterBufferOffsets)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- assert(firstCounterBuffer < MAX_XFB_BUFFERS);
- assert(counterBufferCount <= MAX_XFB_BUFFERS);
- assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
-
- /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
- *
- * "Ssoftware must ensure that no HW stream output operations can be in
- * process or otherwise pending at the point that the MI_LOAD/STORE
- * commands are processed. This will likely require a pipeline flush."
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "end transform feedback");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
- unsigned idx = firstCounterBuffer + cb_idx;
-
- /* If we have a counter buffer, this is a resume so we need to load the
- * value into the streamout offset register. Otherwise, this is a begin
- * and we need to reset it to zero.
+ for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
+ /* Flush the cache if something is written by the video operations and
+ * used by any stage other than the video encode/decode stages.
*/
- if (pCounterBuffers &&
- cb_idx < counterBufferCount &&
- pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
- ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
- uint64_t offset = pCounterBufferOffsets ?
- pCounterBufferOffsets[cb_idx] : 0;
-
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
- srm.MemoryAddress = anv_address_add(counter_buffer->address,
- offset);
- srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
- }
+ if (stage_is_video(dep_info->pMemoryBarriers[i].srcStageMask) &&
+ mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
+ !stage_is_video(dep_info->pMemoryBarriers[i].dstStageMask)) {
+ flush_llc = true;
+ break;
}
}
- cmd_buffer->state.xfb_enabled = false;
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+ if (flush_ccs || flush_llc) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
+#if GFX_VERx10 >= 125
+ fd.FlushCCS = flush_ccs;
+#endif
+#if GFX_VER >= 12
+ /* Using this bit on Gfx9 triggers a GPU hang.
+ * This is undocumented behavior. Gfx12 seems fine.
+ * TODO: check Gfx11
+ */
+ fd.FlushLLC = flush_llc;
+#endif
+ }
+ }
}
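
As an illustration of the flush_llc path above, an application-side ownership-release barrier on the video queue might look like this (a sketch; video_qfi, gfx_qfi, decoded_image and video_cmd_buffer are assumed indices/handles):

   VkImageMemoryBarrier2 release = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_VIDEO_DECODE_BIT_KHR,
      .srcAccessMask = VK_ACCESS_2_VIDEO_DECODE_WRITE_BIT_KHR,
      .dstStageMask = VK_PIPELINE_STAGE_2_NONE,
      .dstAccessMask = 0,
      .oldLayout = VK_IMAGE_LAYOUT_VIDEO_DECODE_DST_KHR,
      .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
      .srcQueueFamilyIndex = video_qfi,  /* != dstQueueFamilyIndex => flush_llc */
      .dstQueueFamilyIndex = gfx_qfi,
      .image = decoded_image,
      .subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .imageMemoryBarrierCount = 1,
      .pImageMemoryBarriers = &release,
   };
   vkCmdPipelineBarrier2(video_cmd_buffer, &dep);
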
-void
-genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
+static void
+cmd_buffer_barrier_blitter(struct anv_cmd_buffer *cmd_buffer,
+ const VkDependencyInfo *dep_info)
{
- struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
- struct anv_compute_pipeline *pipeline = comp_state->pipeline;
-
- assert(pipeline->cs);
+#if GFX_VERx10 >= 125
+ assert(anv_cmd_buffer_is_blitter_queue(cmd_buffer));
- genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
+ /* The blitter requires an MI_FLUSH_DW command when a buffer transitions
+ * from being a destination to a source.
+ */
+ bool flush_llc = false;
+ bool flush_ccs = false;
+ for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
+ const VkImageMemoryBarrier2 *img_barrier =
+ &dep_info->pImageMemoryBarriers[i];
- genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+ ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
+ const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
- /* Apply any pending pipeline flushes we may have. We want to apply them
- * now because, if any of those flushes are for things like push constants,
- * the GPU will read the state at weird times.
- */
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ /* If srcQueueFamilyIndex is not equal to dstQueueFamilyIndex, this
+ * memory barrier defines a queue family ownership transfer.
+ */
+ if (img_barrier->srcQueueFamilyIndex != img_barrier->dstQueueFamilyIndex)
+ flush_llc = true;
- if (cmd_buffer->state.compute.pipeline_dirty) {
- /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
- *
- * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
- * the only bits that are changed are scoreboard related: Scoreboard
- * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
- * these scoreboard related states, a MEDIA_STATE_FLUSH is
- * sufficient."
+ /* Flush the cache if a transfer command reads the output of a previous
+ * transfer command. Ideally we should just wait for completion, but
+ * for now just flush the cache to make the data visible.
*/
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
- "flush compute state");
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ if ((img_barrier->oldLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ||
+ img_barrier->oldLayout == VK_IMAGE_LAYOUT_GENERAL) &&
+ (img_barrier->newLayout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL ||
+ img_barrier->newLayout == VK_IMAGE_LAYOUT_GENERAL)) {
+ flush_llc = true;
+ }
- anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
+ VkImageAspectFlags img_aspects =
+ vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+ anv_foreach_image_aspect_bit(aspect_bit, image, img_aspects) {
+ const uint32_t plane =
+ anv_image_aspect_to_plane(image, 1UL << aspect_bit);
+ if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage)) {
+ flush_ccs = true;
+ }
+ }
+ }
- /* The workgroup size of the pipeline affects our push constant layout
- * so flag push constants as dirty if we change the pipeline.
+ for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
+ /* Flush the cache if something is written by the transfer command and
+ * used by any stage other than the transfer stage, or if
+ * srcQueueFamilyIndex is not equal to dstQueueFamilyIndex (in which
+ * case this memory barrier defines a queue family ownership transfer).
*/
- cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ if ((stage_is_transfer(dep_info->pBufferMemoryBarriers[i].srcStageMask) &&
+ mask_is_write(dep_info->pBufferMemoryBarriers[i].srcAccessMask)) ||
+ (dep_info->pBufferMemoryBarriers[i].srcQueueFamilyIndex !=
+ dep_info->pBufferMemoryBarriers[i].dstQueueFamilyIndex)) {
+ flush_llc = true;
+ break;
+ }
}
- if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
- cmd_buffer->state.compute.pipeline_dirty) {
- flush_descriptor_sets(cmd_buffer,
- &cmd_buffer->state.compute.base,
- VK_SHADER_STAGE_COMPUTE_BIT,
- &pipeline->cs, 1);
- cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
-
-#if GFX_VERx10 < 125
- uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
- struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
- .BindingTablePointer =
- cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
- .SamplerStatePointer =
- cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
- };
- GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
-
- struct anv_state state =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
- pipeline->interface_descriptor_data,
- GENX(INTERFACE_DESCRIPTOR_DATA_length),
- 64);
-
- uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
- anv_batch_emit(&cmd_buffer->batch,
- GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
- mid.InterfaceDescriptorTotalLength = size;
- mid.InterfaceDescriptorDataStartAddress = state.offset;
+ for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
+ /* Flush the cache if something is written by the transfer command and
+ * used by any stage other than the transfer stage.
+ */
+ if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
+ mask_is_write(dep_info->pMemoryBarriers[i].srcAccessMask)) {
+ flush_llc = true;
+ break;
}
-#endif
}
- if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
- comp_state->push_data =
- anv_cmd_buffer_cs_push_constants(cmd_buffer);
-
-#if GFX_VERx10 < 125
- if (comp_state->push_data.alloc_size) {
- anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
- curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
- curbe.CURBEDataStartAddress = comp_state->push_data.offset;
- }
+ if (flush_ccs || flush_llc) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
+ genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
+ cmd_buffer->device);
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
+ fd.FlushCCS = flush_ccs;
+ fd.FlushLLC = flush_llc;
}
-#endif
-
- cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
}
-
- cmd_buffer->state.compute.pipeline_dirty = false;
-
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+#endif
}
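
The destination-to-source transition mentioned above, as an application would express it on a transfer-only (blitter) queue (a sketch; blitter_cmd_buffer and staging_image are assumed handles). The TRANSFER_DST_OPTIMAL to TRANSFER_SRC_OPTIMAL layout pair is what makes the loop above set flush_llc and emit MI_FLUSH_DW:

   VkImageMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COPY_BIT,
      .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_COPY_BIT,
      .dstAccessMask = VK_ACCESS_2_TRANSFER_READ_BIT,
      .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
      .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
      .image = staging_image,
      .subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 },
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .imageMemoryBarrierCount = 1,
      .pImageMemoryBarriers = &barrier,
   };
   vkCmdPipelineBarrier2(blitter_cmd_buffer, &dep);
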
-#if GFX_VER == 7
-
-static VkResult
-verify_cmd_parser(const struct anv_device *device,
- int required_version,
- const char *function)
+static inline bool
+cmd_buffer_has_pending_copy_query(struct anv_cmd_buffer *cmd_buffer)
{
- if (device->physical->cmd_parser_version < required_version) {
- return vk_errorf(device, &device->physical->vk.base,
- VK_ERROR_FEATURE_NOT_PRESENT,
- "cmd parser version %d is required for %s",
- required_version, function);
- } else {
- return VK_SUCCESS;
- }
+ /* Query copies are only written with dataport, so we only need to check
+ * that flag.
+ */
+ return (cmd_buffer->state.queries.buffer_write_bits &
+ ANV_QUERY_WRITES_DATA_FLUSH) != 0;
}
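
A sketch of the sequence this predicate exists for (cmd, query_pool and results_buffer are assumed handles; the assumption is that the query copy leaves ANV_QUERY_WRITES_DATA_FLUSH pending, since, per the comment above, query copies write through the dataport). The transfer-write barrier afterwards is what sets flush_query_copies in cmd_buffer_barrier() below:

   vkCmdCopyQueryPoolResults(cmd, query_pool, 0, 1,
                             results_buffer, 0, 8,
                             VK_QUERY_RESULT_64_BIT);

   VkMemoryBarrier2 mem_barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT,
      .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
      .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
   };
   VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &mem_barrier,
   };
   vkCmdPipelineBarrier2(cmd, &dep);
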
-#endif
-
static void
-anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
- uint32_t baseGroupX,
- uint32_t baseGroupY,
- uint32_t baseGroupZ)
+cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
+ const VkDependencyInfo *dep_info,
+ const char *reason)
{
- if (anv_batch_has_error(&cmd_buffer->batch))
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ cmd_buffer_barrier_video(cmd_buffer, dep_info);
return;
-
- struct anv_push_constants *push =
- &cmd_buffer->state.compute.base.push_constants;
- if (push->cs.base_work_group_id[0] != baseGroupX ||
- push->cs.base_work_group_id[1] != baseGroupY ||
- push->cs.base_work_group_id[2] != baseGroupZ) {
- push->cs.base_work_group_id[0] = baseGroupX;
- push->cs.base_work_group_id[1] = baseGroupY;
- push->cs.base_work_group_id[2] = baseGroupZ;
-
- cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
}
-}
-void genX(CmdDispatch)(
- VkCommandBuffer commandBuffer,
- uint32_t x,
- uint32_t y,
- uint32_t z)
-{
- genX(CmdDispatchBase)(commandBuffer, 0, 0, 0, x, y, z);
-}
-
-#if GFX_VERx10 >= 125
-
-static inline void
-emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_compute_pipeline *pipeline, bool indirect,
- const struct brw_cs_prog_data *prog_data,
- uint32_t groupCountX, uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
- const struct anv_shader_bin *cs_bin = pipeline->cs;
- bool predicate = cmd_buffer->state.conditional_render_enabled;
-
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- const struct brw_cs_dispatch_info dispatch =
- brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
- cw.IndirectParameterEnable = indirect;
- cw.PredicateEnable = predicate;
- cw.SIMDSize = dispatch.simd_size / 16;
- cw.IndirectDataStartAddress = comp_state->push_data.offset;
- cw.IndirectDataLength = comp_state->push_data.alloc_size;
- cw.LocalXMaximum = prog_data->local_size[0] - 1;
- cw.LocalYMaximum = prog_data->local_size[1] - 1;
- cw.LocalZMaximum = prog_data->local_size[2] - 1;
- cw.ThreadGroupIDXDimension = groupCountX;
- cw.ThreadGroupIDYDimension = groupCountY;
- cw.ThreadGroupIDZDimension = groupCountZ;
- cw.ExecutionMask = dispatch.right_mask;
-
- cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
- .KernelStartPointer = cs_bin->kernel.offset,
- .SamplerStatePointer =
- cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
- .BindingTablePointer =
- cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
- .BindingTableEntryCount =
- 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
- .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
- .SharedLocalMemorySize = encode_slm_size(GFX_VER,
- prog_data->base.total_shared),
- .BarrierEnable = prog_data->uses_barrier,
- };
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+ cmd_buffer_barrier_blitter(cmd_buffer, dep_info);
+ return;
}
-}
-#else /* #if GFX_VERx10 >= 125 */
-
-static inline void
-emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_compute_pipeline *pipeline, bool indirect,
- const struct brw_cs_prog_data *prog_data,
- uint32_t groupCountX, uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- bool predicate = (GFX_VER <= 7 && indirect) ||
- cmd_buffer->state.conditional_render_enabled;
-
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- const struct brw_cs_dispatch_info dispatch =
- brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
- ggw.IndirectParameterEnable = indirect;
- ggw.PredicateEnable = predicate;
- ggw.SIMDSize = dispatch.simd_size / 16;
- ggw.ThreadDepthCounterMaximum = 0;
- ggw.ThreadHeightCounterMaximum = 0;
- ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
- ggw.ThreadGroupIDXDimension = groupCountX;
- ggw.ThreadGroupIDYDimension = groupCountY;
- ggw.ThreadGroupIDZDimension = groupCountZ;
- ggw.RightExecutionMask = dispatch.right_mask;
- ggw.BottomExecutionMask = 0xffffffff;
- }
-
- anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
-}
-
-#endif /* #if GFX_VERx10 >= 125 */
-
-static inline void
-emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
- const struct anv_compute_pipeline *pipeline, bool indirect,
- const struct brw_cs_prog_data *prog_data,
- uint32_t groupCountX, uint32_t groupCountY,
- uint32_t groupCountZ)
-{
-#if GFX_VERx10 >= 125
- emit_compute_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
- groupCountY, groupCountZ);
-#else
- emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
- groupCountY, groupCountZ);
-#endif
-}
+ struct anv_device *device = cmd_buffer->device;
-void genX(CmdDispatchBase)(
- VkCommandBuffer commandBuffer,
- uint32_t baseGroupX,
- uint32_t baseGroupY,
- uint32_t baseGroupZ,
- uint32_t groupCountX,
- uint32_t groupCountY,
- uint32_t groupCountZ)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
- const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+ /* XXX: Right now, we're really dumb and just flush whatever categories
+ * the app asks for. One of these days we may make this a bit better
+ * but right now that's all the hardware allows for in most areas.
+ */
+ VkAccessFlags2 src_flags = 0;
+ VkAccessFlags2 dst_flags = 0;
- anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
- baseGroupY, baseGroupZ);
+ bool apply_sparse_flushes = false;
+ bool flush_query_copies = false;
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
+ src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
+ dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_COMPUTE,
- "compute",
- groupCountX * groupCountY * groupCountZ *
- prog_data->local_size[0] * prog_data->local_size[1] *
- prog_data->local_size[2]);
+ /* Shader writes to buffers that could then be written by a transfer
+ * command (including queries).
+ */
+ if (stage_is_shader(dep_info->pMemoryBarriers[i].srcStageMask) &&
+ mask_is_shader_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
+ stage_is_transfer(dep_info->pMemoryBarriers[i].dstStageMask)) {
+ cmd_buffer->state.queries.buffer_write_bits |=
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
+ }
- if (prog_data->uses_num_work_groups) {
- struct anv_state state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
- uint32_t *sizes = state.map;
- sizes[0] = groupCountX;
- sizes[1] = groupCountY;
- sizes[2] = groupCountZ;
- cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = state.offset,
- };
+ if (stage_is_transfer(dep_info->pMemoryBarriers[i].srcStageMask) &&
+ mask_is_transfer_write(dep_info->pMemoryBarriers[i].srcAccessMask) &&
+ cmd_buffer_has_pending_copy_query(cmd_buffer))
+ flush_query_copies = true;
- /* The num_workgroups buffer goes in the binding table */
- cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ /* There's no way of knowing if this memory barrier is related to sparse
+ * buffers! This is pretty horrible.
+ */
+ if (mask_is_write(src_flags) &&
+ p_atomic_read(&device->num_sparse_resources) > 0)
+ apply_sparse_flushes = true;
}
- genX(cmd_buffer_flush_compute_state)(cmd_buffer);
-
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
-
- emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
- groupCountY, groupCountZ);
-}
-
-#define GPGPU_DISPATCHDIMX 0x2500
-#define GPGPU_DISPATCHDIMY 0x2504
-#define GPGPU_DISPATCHDIMZ 0x2508
+ for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
+ const VkBufferMemoryBarrier2 *buf_barrier =
+ &dep_info->pBufferMemoryBarriers[i];
+ ANV_FROM_HANDLE(anv_buffer, buffer, buf_barrier->buffer);
-void genX(CmdDispatchIndirect)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
- struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
- const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
- struct anv_address addr = anv_address_add(buffer->address, offset);
- UNUSED struct anv_batch *batch = &cmd_buffer->batch;
-
- anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
-
-#if GFX_VER == 7
- /* Linux 4.4 added command parser version 5 which allows the GPGPU
- * indirect dispatch registers to be written.
- */
- if (verify_cmd_parser(cmd_buffer->device, 5,
- "vkCmdDispatchIndirect") != VK_SUCCESS)
- return;
-#endif
+ src_flags |= buf_barrier->srcAccessMask;
+ dst_flags |= buf_barrier->dstAccessMask;
- anv_measure_snapshot(cmd_buffer,
- INTEL_SNAPSHOT_COMPUTE,
- "compute indirect",
- 0);
+ /* Shader writes to buffers that could then be written by a transfer
+ * command (including queries).
+ */
+ if (stage_is_shader(buf_barrier->srcStageMask) &&
+ mask_is_shader_write(buf_barrier->srcAccessMask) &&
+ stage_is_transfer(buf_barrier->dstStageMask)) {
+ cmd_buffer->state.queries.buffer_write_bits |=
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
+ }
- if (prog_data->uses_num_work_groups) {
- cmd_buffer->state.compute.num_workgroups = addr;
+ if (stage_is_transfer(buf_barrier->srcStageMask) &&
+ mask_is_transfer_write(buf_barrier->srcAccessMask) &&
+ cmd_buffer_has_pending_copy_query(cmd_buffer))
+ flush_query_copies = true;
- /* The num_workgroups buffer goes in the binding table */
- cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ if (anv_buffer_is_sparse(buffer) && mask_is_write(src_flags))
+ apply_sparse_flushes = true;
}
- genX(cmd_buffer_flush_compute_state)(cmd_buffer);
-
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
-
- struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
- struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
- struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
+ for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
+ const VkImageMemoryBarrier2 *img_barrier =
+ &dep_info->pImageMemoryBarriers[i];
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
+ src_flags |= img_barrier->srcAccessMask;
+ dst_flags |= img_barrier->dstAccessMask;
-#if GFX_VER <= 7
- /* predicate = (compute_dispatch_indirect_x_size == 0); */
- mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
- mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOAD;
- mip.CombineOperation = COMBINE_SET;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
+ ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
+ const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
- /* predicate |= (compute_dispatch_indirect_y_size == 0); */
- mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOAD;
- mip.CombineOperation = COMBINE_OR;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
+ uint32_t base_layer, layer_count;
+ if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
+ base_layer = 0;
+ layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
+ } else {
+ base_layer = range->baseArrayLayer;
+ layer_count = vk_image_subresource_layer_count(&image->vk, range);
+ }
+ const uint32_t level_count =
+ vk_image_subresource_level_count(&image->vk, range);
- /* predicate |= (compute_dispatch_indirect_z_size == 0); */
- mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOAD;
- mip.CombineOperation = COMBINE_OR;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
- }
+ VkImageLayout old_layout = img_barrier->oldLayout;
+ VkImageLayout new_layout = img_barrier->newLayout;
- /* predicate = !predicate; */
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_OR;
- mip.CompareOperation = COMPARE_FALSE;
- }
-
-#if GFX_VERx10 == 75
- if (cmd_buffer->state.conditional_render_enabled) {
- /* predicate &= !(conditional_rendering_predicate == 0); */
- mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
- mi_reg32(ANV_PREDICATE_RESULT_REG));
- anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
- mip.LoadOperation = LOAD_LOADINV;
- mip.CombineOperation = COMBINE_AND;
- mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ /* If we're inside a render pass, the runtime might have converted some
+ * layouts from GENERAL to FEEDBACK_LOOP. Check if that's the case and
+ * reconvert back to the original layout so that application barriers
+ * within the render pass operate with consistent layouts.
+ */
+ if (!cmd_buffer->vk.runtime_rp_barrier &&
+ cmd_buffer->vk.render_pass != NULL) {
+ assert(anv_cmd_graphics_state_has_image_as_attachment(&cmd_buffer->state.gfx,
+ image));
+ VkImageLayout subpass_att_layout, subpass_stencil_att_layout;
+
+ vk_command_buffer_get_attachment_layout(
+ &cmd_buffer->vk, &image->vk,
+ &subpass_att_layout, &subpass_stencil_att_layout);
+
+ old_layout = subpass_att_layout;
+ new_layout = subpass_att_layout;
}
- }
-#endif
-#else /* GFX_VER > 7 */
- if (cmd_buffer->state.conditional_render_enabled)
- genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
-#endif
+ if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
+ transition_depth_buffer(cmd_buffer, image,
+ range->baseMipLevel, level_count,
+ base_layer, layer_count,
+ old_layout, new_layout,
+ false /* will_full_fast_clear */);
+ }
- emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
-}
+ if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
+ transition_stencil_buffer(cmd_buffer, image,
+ range->baseMipLevel, level_count,
+ base_layer, layer_count,
+ old_layout, new_layout,
+ false /* will_full_fast_clear */);
+ }
-#if GFX_VERx10 >= 125
-static void
-calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
-{
- unsigned total_shift = 0;
- memset(local_shift, 0, 3);
-
- bool progress;
- do {
- progress = false;
- for (unsigned i = 0; i < 3; i++) {
- assert(global[i] > 0);
- if ((1 << local_shift[i]) < global[i]) {
- progress = true;
- local_shift[i]++;
- total_shift++;
+ if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
+ VkImageAspectFlags color_aspects =
+ vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+ anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
+ transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
+ range->baseMipLevel, level_count,
+ base_layer, layer_count,
+ old_layout, new_layout,
+ img_barrier->srcQueueFamilyIndex,
+ img_barrier->dstQueueFamilyIndex,
+ false /* will_full_fast_clear */);
}
-
- if (total_shift == 3)
- return;
}
- } while(progress);
-
- /* Assign whatever's left to x */
- local_shift[0] += 3 - total_shift;
-}
-
-static struct GFX_RT_SHADER_TABLE
-vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
-{
- return (struct GFX_RT_SHADER_TABLE) {
- .BaseAddress = anv_address_from_u64(region->deviceAddress),
- .Stride = region->stride,
- };
-}
-
-static void
-cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
- const VkStridedDeviceAddressRegionKHR *raygen_sbt,
- const VkStridedDeviceAddressRegionKHR *miss_sbt,
- const VkStridedDeviceAddressRegionKHR *hit_sbt,
- const VkStridedDeviceAddressRegionKHR *callable_sbt,
- bool is_indirect,
- uint32_t launch_width,
- uint32_t launch_height,
- uint32_t launch_depth,
- uint64_t launch_size_addr)
-{
- struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
- struct anv_ray_tracing_pipeline *pipeline = rt->pipeline;
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- /* If we have a known degenerate launch size, just bail */
- if (!is_indirect &&
- (launch_width == 0 || launch_height == 0 || launch_depth == 0))
- return;
-
- genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
- genX(flush_pipeline_select_gpgpu)(cmd_buffer);
-
- cmd_buffer->state.rt.pipeline_dirty = false;
-
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- /* Add these to the reloc list as they're internal buffers that don't
- * actually have relocs to pick them up manually.
- *
- * TODO(RT): This is a bit of a hack
- */
- anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
- cmd_buffer->batch.alloc,
- rt->scratch.bo);
-
- /* Allocate and set up our RT_DISPATCH_GLOBALS */
- struct anv_state rtdg_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- BRW_RT_PUSH_CONST_OFFSET +
- sizeof(struct anv_push_constants),
- 64);
-
- struct GFX_RT_DISPATCH_GLOBALS rtdg = {
- .MemBaseAddress = (struct anv_address) {
- .bo = rt->scratch.bo,
- .offset = rt->scratch.layout.ray_stack_start,
- },
- .CallStackHandler =
- anv_shader_bin_get_bsr(cmd_buffer->device->rt_trivial_return, 0),
- .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
- .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
- .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
- .Flags = RT_DEPTH_TEST_LESS_EQUAL,
- .HitGroupTable = vk_sdar_to_shader_table(hit_sbt),
- .MissGroupTable = vk_sdar_to_shader_table(miss_sbt),
- .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
- .LaunchWidth = launch_width,
- .LaunchHeight = launch_height,
- .LaunchDepth = launch_depth,
- .CallableGroupTable = vk_sdar_to_shader_table(callable_sbt),
- };
- GFX_RT_DISPATCH_GLOBALS_pack(NULL, rtdg_state.map, &rtdg);
- /* Push constants go after the RT_DISPATCH_GLOBALS */
- assert(GFX_RT_DISPATCH_GLOBALS_length * 4 <= BRW_RT_PUSH_CONST_OFFSET);
- memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
- &cmd_buffer->state.rt.base.push_constants,
- sizeof(struct anv_push_constants));
-
- struct anv_address rtdg_addr = {
- .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- .offset = rtdg_state.offset,
- };
-
- uint8_t local_size_log2[3];
- uint32_t global_size[3] = {};
- if (is_indirect) {
- /* Pick a local size that's probably ok. We assume most TraceRays calls
- * will use a two-dimensional dispatch size. Worst case, our initial
- * dispatch will be a little slower than it has to be.
+ /* Mark image as compressed if the destination layout has untracked
+ * writes to the aux surface.
*/
- local_size_log2[0] = 2;
- local_size_log2[1] = 1;
- local_size_log2[2] = 0;
+ VkImageAspectFlags aspects =
+ vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
+ anv_foreach_image_aspect_bit(aspect_bit, image, aspects) {
+ VkImageAspectFlagBits aspect = 1UL << aspect_bit;
+ if (anv_layout_has_untracked_aux_writes(
+ device->info,
+ image, aspect,
+ img_barrier->newLayout,
+ cmd_buffer->queue_family->queueFlags)) {
+ for (uint32_t l = 0; l < level_count; l++) {
+ const uint32_t level = range->baseMipLevel + l;
+ const uint32_t aux_layers =
+ anv_image_aux_layers(image, aspect, level);
+
+ if (base_layer >= aux_layers)
+ break; /* We will only get fewer layers as level increases */
+
+ uint32_t level_layer_count =
+ MIN2(layer_count, aux_layers - base_layer);
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ set_image_compressed_bit(cmd_buffer, image, aspect,
+ level,
+ base_layer, level_layer_count,
+ true);
+ }
+ }
+ }
- struct mi_value launch_size[3] = {
- mi_mem32(anv_address_from_u64(launch_size_addr + 0)),
- mi_mem32(anv_address_from_u64(launch_size_addr + 4)),
- mi_mem32(anv_address_from_u64(launch_size_addr + 8)),
- };
+ if (anv_image_is_sparse(image) && mask_is_write(src_flags))
+ apply_sparse_flushes = true;
+ }
- /* Store the original launch size into RT_DISPATCH_GLOBALS
- *
- * TODO: Pull values from genX_bits.h once RT_DISPATCH_GLOBALS gets
- * moved into a genX version.
- */
- mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 52)),
- mi_value_ref(&b, launch_size[0]));
- mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 56)),
- mi_value_ref(&b, launch_size[1]));
- mi_store(&b, mi_mem32(anv_address_add(rtdg_addr, 60)),
- mi_value_ref(&b, launch_size[2]));
-
- /* Compute the global dispatch size */
- for (unsigned i = 0; i < 3; i++) {
- if (local_size_log2[i] == 0)
- continue;
+ enum anv_pipe_bits bits =
+ anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
+ anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
- /* global_size = DIV_ROUND_UP(launch_size, local_size)
- *
- * Fortunately for us MI_ALU math is 64-bit and , mi_ushr32_imm
- * has the semantics of shifting the enture 64-bit value and taking
- * the bottom 32 so we don't have to worry about roll-over.
- */
- uint32_t local_size = 1 << local_size_log2[i];
- launch_size[i] = mi_iadd(&b, launch_size[i],
- mi_imm(local_size - 1));
- launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
- local_size_log2[i]);
- }
-
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
- mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
- } else {
- uint32_t launch_size[3] = { launch_width, launch_height, launch_depth };
- calc_local_trace_size(local_size_log2, launch_size);
+ /* Our HW implementation of the sparse feature lives in the GAM unit
+ * (interface between all the GPU caches and external memory). As a result
+ * writes to NULL bound images & buffers that should be ignored are
+    * actually still visible in the caches. The only way to make NULL-bound
+    * regions correctly return 0s is to evict the caches so that they are
+    * repopulated with 0s.
+ */
+ if (apply_sparse_flushes)
+ bits |= ANV_PIPE_FLUSH_BITS;
- for (unsigned i = 0; i < 3; i++) {
- /* We have to be a bit careful here because DIV_ROUND_UP adds to the
- * numerator value may overflow. Cast to uint64_t to avoid this.
- */
- uint32_t local_size = 1 << local_size_log2[i];
- global_size[i] = DIV_ROUND_UP((uint64_t)launch_size[i], local_size);
- }
+ /* Copies from query pools are executed with a shader writing through the
+ * dataport.
+ */
+ if (flush_query_copies) {
+ bits |= (GFX_VER >= 12 ?
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT : ANV_PIPE_DATA_CACHE_FLUSH_BIT);
}
- anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
- cw.IndirectParameterEnable = is_indirect;
- cw.PredicateEnable = false;
- cw.SIMDSize = SIMD8;
- cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
- cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
- cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
- cw.ThreadGroupIDXDimension = global_size[0];
- cw.ThreadGroupIDYDimension = global_size[1];
- cw.ThreadGroupIDZDimension = global_size[2];
- cw.ExecutionMask = 0xff;
- cw.EmitInlineParameter = true;
-
- const gl_shader_stage s = MESA_SHADER_RAYGEN;
- struct anv_device *device = cmd_buffer->device;
- struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
- struct anv_state *samplers = &cmd_buffer->state.samplers[s];
- cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
- .KernelStartPointer = device->rt_trampoline->kernel.offset,
- .SamplerStatePointer = samplers->offset,
- /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
- .SamplerCount = 0,
- .BindingTablePointer = surfaces->offset,
- .NumberofThreadsinGPGPUThreadGroup = 1,
- .BTDMode = true,
- };
+ if (dst_flags & VK_ACCESS_INDIRECT_COMMAND_READ_BIT)
+ genX(cmd_buffer_flush_generated_draws)(cmd_buffer);
- struct brw_rt_raygen_trampoline_params trampoline_params = {
- .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
- .raygen_bsr_addr = raygen_sbt->deviceAddress,
- .is_indirect = is_indirect,
- .local_group_size_log2 = {
- local_size_log2[0],
- local_size_log2[1],
- local_size_log2[2],
- },
- };
- STATIC_ASSERT(sizeof(trampoline_params) == 32);
- memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
- }
+ anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
}
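The barrier handling above first accumulates the access masks across every buffer and image barrier and only then translates the union into pipe-control bits, so a VkDependencyInfo carrying many barriers still produces a single combined flush. A self-contained sketch of that accumulate-then-translate shape; the bit values, struct, and helper names are illustrative stand-ins for the ANV_PIPE_* flags and the anv_pipe_*_bits_for_access_flags() helpers, not the driver's actual definitions:

#include <stdint.h>

/* Illustrative pipe bits, standing in for the ANV_PIPE_* flags. */
#define FLUSH_RENDER_TARGET   (1u << 0)
#define FLUSH_DEPTH           (1u << 1)
#define INVALIDATE_TEXTURE    (1u << 2)
#define INVALIDATE_CONSTANT   (1u << 3)

struct barrier {
   uint64_t src_access;   /* VkAccessFlags2-style write mask */
   uint64_t dst_access;   /* VkAccessFlags2-style read mask */
};

/* Hypothetical stand-ins for anv_pipe_flush_bits_for_access_flags() and
 * anv_pipe_invalidate_bits_for_access_flags(). */
static uint32_t flush_bits_for_writes(uint64_t src)
{
   return src ? (FLUSH_RENDER_TARGET | FLUSH_DEPTH) : 0;
}

static uint32_t invalidate_bits_for_reads(uint64_t dst)
{
   return dst ? (INVALIDATE_TEXTURE | INVALIDATE_CONSTANT) : 0;
}

static uint32_t
accumulate_barrier_bits(const struct barrier *barriers, unsigned count)
{
   uint64_t src = 0, dst = 0;

   /* One pass over every barrier: only the union of access masks matters. */
   for (unsigned i = 0; i < count; i++) {
      src |= barriers[i].src_access;
      dst |= barriers[i].dst_access;
   }

   /* Translate the combined masks into one set of pending bits, flushed
    * once by the caller (cf. anv_add_pending_pipe_bits() above). */
   return flush_bits_for_writes(src) | invalidate_bits_for_reads(dst);
}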
-void
-genX(CmdTraceRaysKHR)(
+void genX(CmdPipelineBarrier2)(
VkCommandBuffer commandBuffer,
- const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
- uint32_t width,
- uint32_t height,
- uint32_t depth)
+ const VkDependencyInfo* pDependencyInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- cmd_buffer_trace_rays(cmd_buffer,
- pRaygenShaderBindingTable,
- pMissShaderBindingTable,
- pHitShaderBindingTable,
- pCallableShaderBindingTable,
- false /* is_indirect */,
- width, height, depth,
- 0 /* launch_size_addr */);
+ cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
}
void
-genX(CmdTraceRaysIndirectKHR)(
- VkCommandBuffer commandBuffer,
- const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
- const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
- VkDeviceAddress indirectDeviceAddress)
+genX(batch_emit_breakpoint)(struct anv_batch *batch,
+ struct anv_device *device,
+ bool emit_before_draw)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   /* Only bump the draw call count once, on the pre-draw breakpoint. */
+ uint32_t draw_count = emit_before_draw ?
+ p_atomic_inc_return(&device->draw_call_count) :
+ p_atomic_read(&device->draw_call_count);
+
+ if (((draw_count == intel_debug_bkp_before_draw_count &&
+ emit_before_draw) ||
+ (draw_count == intel_debug_bkp_after_draw_count &&
+ !emit_before_draw))) {
+ struct anv_address wait_addr =
+ anv_state_pool_state_address(&device->dynamic_state_pool,
+ device->breakpoint);
+
+ anv_batch_emit(batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = 0x1;
+ sem.SemaphoreAddress = wait_addr;
+ };
+ }
+}
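The counter handling is easy to misread: only the pre-draw call increments device->draw_call_count (p_atomic_inc_return), while the post-draw call merely reads it back, so both breakpoints for a given draw compare against the same draw index. A standalone illustration of that pairing using C11 atomics; the breakpoint thresholds are made-up values, not the INTEL_DEBUG defaults:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint_fast32_t draw_call_count;

/* Stand-ins for intel_debug_bkp_before_draw_count / _after_draw_count. */
static const uint32_t bkp_before_draw_count = 3;
static const uint32_t bkp_after_draw_count  = 3;

static void
emit_breakpoint(bool before_draw)
{
   /* Pre-draw bumps the counter; post-draw re-reads the same value, so a
    * single draw is identified by one number on both sides. */
   uint32_t draw = before_draw ?
      (uint32_t)atomic_fetch_add(&draw_call_count, 1) + 1 :
      (uint32_t)atomic_load(&draw_call_count);

   if ((before_draw && draw == bkp_before_draw_count) ||
       (!before_draw && draw == bkp_after_draw_count))
      printf("%s draw %u: would emit MI_SEMAPHORE_WAIT\n",
             before_draw ? "before" : "after", draw);
}

int main(void)
{
   for (int i = 0; i < 4; i++) {
      emit_breakpoint(true);    /* before the draw */
      /* ... the draw itself ... */
      emit_breakpoint(false);   /* after the draw */
   }
   return 0;
}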
- cmd_buffer_trace_rays(cmd_buffer,
- pRaygenShaderBindingTable,
- pMissShaderBindingTable,
- pHitShaderBindingTable,
- pCallableShaderBindingTable,
- true /* is_indirect */,
- 0, 0, 0, /* width, height, depth, */
- indirectDeviceAddress);
+/* Only emit PIPELINE_SELECT; for the whole mode switch, including the
+ * required flushes, use flush_pipeline_select().
+ */
+void
+genX(emit_pipeline_select)(struct anv_batch *batch, uint32_t pipeline,
+ const struct anv_device *device)
+{
+ /* Bspec 55860: Xe2+ no longer requires PIPELINE_SELECT */
+#if GFX_VER < 20
+ anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) {
+ ps.MaskBits = GFX_VERx10 >= 125 ? 0x93 : GFX_VER >= 12 ? 0x13 : 0x3;
+#if GFX_VER == 12
+ ps.MediaSamplerDOPClockGateEnable = true;
+#endif
+ ps.PipelineSelection = pipeline;
+#if GFX_VERx10 == 125
+ /* It might still be better to only enable this when the compute
+ * pipeline will have DPAS instructions.
+ */
+ ps.SystolicModeEnable = pipeline == GPGPU &&
+ device->vk.enabled_extensions.KHR_cooperative_matrix &&
+ device->vk.enabled_features.cooperativeMatrix;
+#endif
+ }
+#endif /* if GFX_VER < 20 */
}
-#endif /* GFX_VERx10 >= 125 */
static void
genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
uint32_t pipeline)
{
- UNUSED const struct intel_device_info *devinfo = &cmd_buffer->device->info;
+ UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
if (cmd_buffer->state.current_pipeline == pipeline)
return;
-#if GFX_VER >= 8 && GFX_VER < 10
+#if GFX_VER >= 20
+ /* While PIPELINE_SELECT is not needed on Xe2+, our current assumption
+ * is that the pipelined flushes in the 3D pipeline are not getting
+ * synchronized with the compute dispatches (and vice versa). So we need
+    * a CS_STALL prior to the next set of commands to ensure the flushes
+    * have completed.
+ *
+ * The new RESOURCE_BARRIER instruction has support for synchronizing
+ * 3D/Compute and once we switch to that we should be able to get rid of
+ * this CS_STALL.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT, "pipeline switch stall");
+
+ /* Since we are not stalling/flushing caches explicitly while switching
+ * between the pipelines, we need to apply data dependency flushes recorded
+ * previously on the resource.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+#else
+
+#if GFX_VER == 9
/* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
*
* Software must clear the COLOR_CALC_STATE Valid field in
@@ -5393,6 +4230,96 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif
+#if GFX_VERx10 == 120
+ /* Undocumented workaround to force the re-emission of
+ * MEDIA_INTERFACE_DESCRIPTOR_LOAD when switching from 3D to Compute
+    * pipeline without rebinding a pipeline:
+ * vkCmdBindPipeline(COMPUTE, cs_pipeline);
+ * vkCmdDispatch(...);
+ * vkCmdBindPipeline(GRAPHICS, gfx_pipeline);
+ * vkCmdDraw(...);
+ * vkCmdDispatch(...);
+ */
+ if (pipeline == _3D)
+ cmd_buffer->state.compute.pipeline_dirty = true;
+#endif
+
+ /* We apparently cannot flush the tile cache (color/depth) from the GPGPU
+ * pipeline. That means query clears will not be visible to query
+ * copy/write. So we need to flush it before going to GPGPU mode.
+ */
+ if (cmd_buffer->state.current_pipeline == _3D &&
+ cmd_buffer->state.queries.clear_bits) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_QUERY_BITS(cmd_buffer->state.queries.clear_bits),
+ "query clear flush prior to GPGPU");
+ }
+
+   /* Flush and invalidate bits needed prior to PIPELINE_SELECT. */
+ enum anv_pipe_bits bits = 0;
+
+#if GFX_VER >= 12
+ /* From Tigerlake PRM, Volume 2a, PIPELINE_SELECT:
+ *
+ * "Software must ensure Render Cache, Depth Cache and HDC Pipeline flush
+ * are flushed through a stalling PIPE_CONTROL command prior to
+ * programming of PIPELINE_SELECT command transitioning Pipeline Select
+ * from 3D to GPGPU/Media.
+ * Software must ensure HDC Pipeline flush and Generic Media State Clear
+ * is issued through a stalling PIPE_CONTROL command prior to programming
+ * of PIPELINE_SELECT command transitioning Pipeline Select from
+ * GPGPU/Media to 3D."
+ *
+ * Note: Issuing PIPE_CONTROL_MEDIA_STATE_CLEAR causes GPU hangs, probably
+ * because PIPE was not in MEDIA mode?!
+ */
+ bits |= ANV_PIPE_CS_STALL_BIT | ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
+
+ if (cmd_buffer->state.current_pipeline == _3D) {
+ bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
+ } else {
+ bits |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+#else
+ /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+ * PIPELINE_SELECT [DevBWR+]":
+ *
+ * Project: DEVSNB+
+ *
+ * Software must ensure all the write caches are flushed through a
+ * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
+ * command to invalidate read only caches prior to programming
+ * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
+ *
+ * Note the cmd_buffer_apply_pipe_flushes will split this into two
+ * PIPE_CONTROLs.
+ */
+ bits |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+#endif
+
+ /* Wa_16013063087 - State Cache Invalidate must be issued prior to
+ * PIPELINE_SELECT when switching from 3D to Compute.
+ *
+    * SW must do this by programming a PIPECONTROL with “CS Stall” followed by
+    * a PIPECONTROL with the State Cache Invalidate bit set.
+ *
+ */
+ if (cmd_buffer->state.current_pipeline == _3D && pipeline == GPGPU &&
+ intel_needs_workaround(cmd_buffer->device->info, 16013063087))
+ bits |= ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
+
+ anv_add_pending_pipe_bits(cmd_buffer, bits, "flush/invalidate PIPELINE_SELECT");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
#if GFX_VER == 9
if (pipeline == _3D) {
/* There is a mid-object preemption workaround which requires you to
@@ -5400,6 +4327,13 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
* even without preemption, we have issues with geometry flickering when
* GPGPU and 3D are back-to-back and this seems to fix it. We don't
* really know why.
+ *
+ * Also, from the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
+ *
+ * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+ * the only bits that are changed are scoreboard related ..."
+ *
+ * This is satisfied by applying pre-PIPELINE_SELECT pipe flushes above.
*/
anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_VFE_STATE), vfe) {
vfe.MaximumNumberofThreads =
@@ -5417,54 +4351,10 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
}
#endif
- /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
- * PIPELINE_SELECT [DevBWR+]":
- *
- * Project: DEVSNB+
- *
- * Software must ensure all the write caches are flushed through a
- * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
- * command to invalidate read only caches prior to programming
- * MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.RenderTargetCacheFlushEnable = true;
- pc.DepthCacheFlushEnable = true;
-#if GFX_VER >= 12
- pc.HDCPipelineFlushEnable = true;
-#else
- pc.DCFlushEnable = true;
-#endif
- pc.PostSyncOperation = NoWrite;
- pc.CommandStreamerStallEnable = true;
-#if GFX_VER >= 12
- /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must be
- * set with any PIPE_CONTROL with Depth Flush Enable bit set.
- */
- pc.DepthStallEnable = true;
-#endif
- anv_debug_dump_pc(pc);
- }
-
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.TextureCacheInvalidationEnable = true;
- pc.ConstantCacheInvalidationEnable = true;
- pc.StateCacheInvalidationEnable = true;
- pc.InstructionCacheInvalidateEnable = true;
- pc.PostSyncOperation = NoWrite;
- anv_debug_dump_pc(pc);
- }
-
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
-#if GFX_VER >= 9
- ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
- ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
-#endif
- ps.PipelineSelection = pipeline;
- }
+ genX(emit_pipeline_select)(&cmd_buffer->batch, pipeline, cmd_buffer->device);
#if GFX_VER == 9
- if (devinfo->is_geminilake) {
+ if (devinfo->platform == INTEL_PLATFORM_GLK) {
/* Project: DevGLK
*
* "This chicken bit works around a hardware issue with barrier logic
@@ -5479,7 +4369,7 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
}
}
#endif
-
+#endif /* else of if GFX_VER >= 20 */
cmd_buffer->state.current_pipeline = pipeline;
}
@@ -5496,54 +4386,20 @@ genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
}
void
-genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
-{
- if (GFX_VER >= 8)
- return;
-
- /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
- *
- * "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
- * combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
- * 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
- * issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
- * set), followed by a pipelined depth cache flush (PIPE_CONTROL with
- * Depth Flush Bit set, followed by another pipelined depth stall
- * (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
- * guarantee that the pipeline from WM onwards is already flushed (e.g.,
- * via a preceding MI_FLUSH)."
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
- pipe.DepthStallEnable = true;
- anv_debug_dump_pc(pipe);
- }
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
- pipe.DepthCacheFlushEnable = true;
-#if GFX_VER >= 12
- pipe.TileCacheFlushEnable = true;
-#endif
- anv_debug_dump_pc(pipe);
- }
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
- pipe.DepthStallEnable = true;
- anv_debug_dump_pc(pipe);
- }
-}
-
-void
genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
const struct isl_surf *surf)
{
-#if GFX_VERx10 == 120
- const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM;
+#if INTEL_NEEDS_WA_1808121037
+ const bool is_d16_1x_msaa = surf->format == ISL_FORMAT_R16_UNORM &&
+ surf->samples == 1;
switch (cmd_buffer->state.depth_reg_mode) {
case ANV_DEPTH_REG_MODE_HW_DEFAULT:
- if (!fmt_is_d16)
+ if (!is_d16_1x_msaa)
return;
break;
- case ANV_DEPTH_REG_MODE_D16:
- if (fmt_is_d16)
+ case ANV_DEPTH_REG_MODE_D16_1X_MSAA:
+ if (is_d16_1x_msaa)
return;
break;
case ANV_DEPTH_REG_MODE_UNKNOWN:
@@ -5558,33 +4414,26 @@ genX(cmd_buffer_emit_gfx12_depth_wa)(struct anv_cmd_buffer *cmd_buffer,
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
ANV_PIPE_DEPTH_STALL_BIT |
ANV_PIPE_END_OF_PIPE_SYNC_BIT,
- "Workaround: Stop pipeline for 14010455700");
+ "Workaround: Stop pipeline for 1808121037");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- /* Wa_14010455700
+ /* Wa_1808121037
*
* To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
* Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
*/
anv_batch_write_reg(&cmd_buffer->batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
- reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1;
+ reg.HIZPlaneOptimizationdisablebit = is_d16_1x_msaa;
reg.HIZPlaneOptimizationdisablebitMask = true;
}
- /* Wa_1806527549
- *
- * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM.
- */
- anv_batch_write_reg(&cmd_buffer->batch, GENX(HIZ_CHICKEN), reg) {
- reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16;
- reg.HZDepthTestLEGEOptimizationDisableMask = true;
- }
-
cmd_buffer->state.depth_reg_mode =
- fmt_is_d16 ? ANV_DEPTH_REG_MODE_D16 : ANV_DEPTH_REG_MODE_HW_DEFAULT;
+ is_d16_1x_msaa ? ANV_DEPTH_REG_MODE_D16_1X_MSAA :
+ ANV_DEPTH_REG_MODE_HW_DEFAULT;
#endif
}
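The early returns above are just a cached-state check: the chicken register only needs reprogramming when the D16 1X-MSAA requirement changes, or when the register contents are unknown (e.g. at the start of a batch). A minimal sketch of that decision, with illustrative enum names mirroring ANV_DEPTH_REG_MODE_*:

#include <stdbool.h>

enum depth_reg_mode {
   DEPTH_REG_MODE_UNKNOWN,      /* register state not tracked yet */
   DEPTH_REG_MODE_HW_DEFAULT,   /* chicken bit left at its default */
   DEPTH_REG_MODE_D16_1X_MSAA,  /* chicken bit set for D16 1X MSAA */
};

/* Returns true when COMMON_SLICE_CHICKEN1 must be (re)programmed for the
 * depth surface currently being bound. */
static bool
depth_wa_needs_reprogram(enum depth_reg_mode current, bool is_d16_1x_msaa)
{
   switch (current) {
   case DEPTH_REG_MODE_HW_DEFAULT:
      return is_d16_1x_msaa;        /* default set, but the WA is needed */
   case DEPTH_REG_MODE_D16_1X_MSAA:
      return !is_d16_1x_msaa;       /* WA set, but no longer needed */
   case DEPTH_REG_MODE_UNKNOWN:
   default:
      return true;                  /* always program at least once */
   }
}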
+#if GFX_VER == 9
/* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
*
* "The VF cache needs to be invalidated before binding and then using
@@ -5618,8 +4467,7 @@ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer
struct anv_address vb_address,
uint32_t vb_size)
{
- if (GFX_VER < 8 || GFX_VER > 9 ||
- !anv_use_softpin(cmd_buffer->device->physical))
+ if (GFX_VER > 9)
return;
struct anv_vb_cache_range *bound, *dirty;
@@ -5634,28 +4482,9 @@ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer
dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
}
- if (vb_size == 0) {
- bound->start = 0;
- bound->end = 0;
- return;
- }
-
- assert(vb_address.bo && (vb_address.bo->flags & EXEC_OBJECT_PINNED));
- bound->start = intel_48b_address(anv_address_physical(vb_address));
- bound->end = bound->start + vb_size;
- assert(bound->end > bound->start); /* No overflow */
-
- /* Align everything to a cache line */
- bound->start &= ~(64ull - 1ull);
- bound->end = align_u64(bound->end, 64);
-
- /* Compute the dirty range */
- dirty->start = MIN2(dirty->start, bound->start);
- dirty->end = MAX2(dirty->end, bound->end);
-
- /* If our range is larger than 32 bits, we have to flush */
- assert(bound->end - bound->start <= (1ull << 32));
- if (dirty->end - dirty->start > (1ull << 32)) {
+ if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
+ vb_address,
+ vb_size)) {
anv_add_pending_pipe_bits(cmd_buffer,
ANV_PIPE_CS_STALL_BIT |
ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
@@ -5668,19 +4497,12 @@ genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_b
uint32_t access_type,
uint64_t vb_used)
{
- if (GFX_VER < 8 || GFX_VER > 9 ||
- !anv_use_softpin(cmd_buffer->device->physical))
- return;
-
if (access_type == RANDOM) {
/* We have an index buffer */
struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
- if (bound->end > bound->start) {
- dirty->start = MIN2(dirty->start, bound->start);
- dirty->end = MAX2(dirty->end, bound->end);
- }
+ anv_merge_vb_cache_range(dirty, bound);
}
uint64_t mask = vb_used;
@@ -5694,12 +4516,10 @@ genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_b
bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
- if (bound->end > bound->start) {
- dirty->start = MIN2(dirty->start, bound->start);
- dirty->end = MAX2(dirty->end, bound->end);
- }
+ anv_merge_vb_cache_range(dirty, bound);
}
}
+#endif /* GFX_VER == 9 */
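For reference, the inline logic removed above (64-byte alignment of the bound range, merging it into the dirty range, and the 4 GiB overflow check) factors out into helpers roughly shaped like the sketch below; the names and standalone types are illustrative, and the real helpers live in the anv headers:

#include <stdbool.h>
#include <stdint.h>

struct vb_cache_range {
   uint64_t start;
   uint64_t end;
};

/* Roughly what anv_merge_vb_cache_range() does: grow the dirty range to
 * cover the currently bound range, if any. */
static void
merge_vb_cache_range(struct vb_cache_range *dirty,
                     const struct vb_cache_range *bound)
{
   if (bound->end <= bound->start)
      return;
   if (bound->start < dirty->start)
      dirty->start = bound->start;
   if (bound->end > dirty->end)
      dirty->end = bound->end;
}

/* Roughly the check the removed code performed inline: record the newly
 * bound range aligned to 64-byte cache lines, fold it into the dirty range
 * (which starts out as {UINT64_MAX, 0}), and report whether the dirty span
 * now exceeds 32 bits, in which case the VF cache must be invalidated. */
static bool
vb_cache_range_needs_workaround(struct vb_cache_range *bound,
                                struct vb_cache_range *dirty,
                                uint64_t address, uint64_t size)
{
   if (size == 0) {
      bound->start = bound->end = 0;
      return false;
   }

   bound->start = address & ~63ull;
   bound->end = (address + size + 63) & ~63ull;

   merge_vb_cache_range(dirty, bound);

   return dirty->end - dirty->start > (1ull << 32);
}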
/**
* Update the pixel hashing modes that determine the balancing of PS threads
@@ -5724,7 +4544,7 @@ genX(cmd_buffer_emit_hashing_mode)(struct anv_cmd_buffer *cmd_buffer,
unsigned scale)
{
#if GFX_VER == 9
- const struct intel_device_info *devinfo = &cmd_buffer->device->info;
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
const unsigned slice_hashing[] = {
/* Because all Gfx9 platforms with more than one slice require
* three-way subslice hashing, a single "normal" 16x16 slice hashing
@@ -5796,25 +4616,39 @@ static void
cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
{
struct anv_device *device = cmd_buffer->device;
- const struct anv_image_view *iview =
- anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
- const struct anv_image *image = iview ? iview->image : NULL;
-
- /* FIXME: Width and Height are wrong */
-
- genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
device->isl_dev.ds.size / 4);
if (dw == NULL)
return;
- struct isl_depth_stencil_hiz_emit_info info = { };
+ struct isl_view isl_view = {};
+ struct isl_depth_stencil_hiz_emit_info info = {
+ .view = &isl_view,
+ .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
+ };
+
+ if (gfx->depth_att.iview != NULL) {
+ isl_view = gfx->depth_att.iview->planes[0].isl;
+ } else if (gfx->stencil_att.iview != NULL) {
+ isl_view = gfx->stencil_att.iview->planes[0].isl;
+ }
- if (iview)
- info.view = &iview->planes[0].isl;
+ if (gfx->view_mask) {
+ assert(isl_view.array_len == 0 ||
+ isl_view.array_len >= util_last_bit(gfx->view_mask));
+ isl_view.array_len = util_last_bit(gfx->view_mask);
+ } else {
+ assert(isl_view.array_len == 0 ||
+ isl_view.array_len >= util_last_bit(gfx->layer_count));
+ isl_view.array_len = gfx->layer_count;
+ }
+
+ if (gfx->depth_att.iview != NULL) {
+ const struct anv_image_view *iview = gfx->depth_att.iview;
+ const struct anv_image *image = iview->image;
- if (image && (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
const uint32_t depth_plane =
anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
const struct anv_surface *depth_surface =
@@ -5822,18 +4656,14 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
const struct anv_address depth_address =
anv_image_address(image, &depth_surface->memory_range);
- info.depth_surf = &depth_surface->isl;
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, depth_address.bo);
- info.depth_address =
- anv_batch_emit_reloc(&cmd_buffer->batch,
- dw + device->isl_dev.ds.depth_offset / 4,
- depth_address.bo, depth_address.offset);
+ info.depth_surf = &depth_surface->isl;
+ info.depth_address = anv_address_physical(depth_address);
info.mocs =
anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
- const uint32_t ds =
- cmd_buffer->state.subpass->depth_stencil_attachment->attachment;
- info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;
+ info.hiz_usage = gfx->depth_att.aux_usage;
if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
assert(isl_aux_usage_has_hiz(info.hiz_usage));
@@ -5842,18 +4672,19 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
const struct anv_address hiz_address =
anv_image_address(image, &hiz_surface->memory_range);
- info.hiz_surf = &hiz_surface->isl;
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, hiz_address.bo);
- info.hiz_address =
- anv_batch_emit_reloc(&cmd_buffer->batch,
- dw + device->isl_dev.ds.hiz_offset / 4,
- hiz_address.bo, hiz_address.offset);
+ info.hiz_surf = &hiz_surface->isl;
+ info.hiz_address = anv_address_physical(hiz_address);
info.depth_clear_value = ANV_HZ_FC_VAL;
}
}
- if (image && (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
+ if (gfx->stencil_att.iview != NULL) {
+ const struct anv_image_view *iview = gfx->stencil_att.iview;
+ const struct anv_image *image = iview->image;
+
const uint32_t stencil_plane =
anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
const struct anv_surface *stencil_surface =
@@ -5861,555 +4692,684 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
const struct anv_address stencil_address =
anv_image_address(image, &stencil_surface->memory_range);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, stencil_address.bo);
+
info.stencil_surf = &stencil_surface->isl;
info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
- info.stencil_address =
- anv_batch_emit_reloc(&cmd_buffer->batch,
- dw + device->isl_dev.ds.stencil_offset / 4,
- stencil_address.bo, stencil_address.offset);
+ info.stencil_address = anv_address_physical(stencil_address);
info.mocs =
anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
}
isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
+ /* Wa_14016712196:
+ * Emit depth flush after state that sends implicit depth flush.
+ */
+ if (intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT);
+ }
+
if (info.depth_surf)
genX(cmd_buffer_emit_gfx12_depth_wa)(cmd_buffer, info.depth_surf);
- if (GFX_VER >= 12) {
+ if (GFX_VER >= 11) {
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- /* Wa_1408224581
- *
- * Workaround: Gfx12LP Astep only An additional pipe control with
- * post-sync = store dword operation would be required.( w/a is to
- * have an additional pipe control after the stencil state whenever
- * the surface state bits of this state is changing).
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = cmd_buffer->device->workaround_address;
+ if (intel_needs_workaround(cmd_buffer->device->info, 1408224581) ||
+ intel_needs_workaround(cmd_buffer->device->info, 14014097488)) {
+ /* Wa_1408224581
+ *
+ * Workaround: Gfx12LP Astep only An additional pipe control with
+ * post-sync = store dword operation would be required.( w/a is to
+ * have an additional pipe control after the stencil state whenever
+ * the surface state bits of this state is changing).
+ *
+ * This also seems sufficient to handle Wa_14014097488.
+ */
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData,
+ cmd_buffer->device->workaround_address, 0, 0);
}
}
cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
}
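The isl_view.array_len selection earlier in this function (and again in CmdBeginRendering below) encodes a simple rule: with multiview enabled, the depth/stencil view must span every view index that can be written, which is the 1-based index of the highest set bit in the view mask; otherwise the plain layer count is used. A standalone illustration of the util_last_bit() choice used above:

#include <stdint.h>

/* E.g. view_mask = 0b0101 -> views 0 and 2 -> array_len = 3.  Without
 * multiview the render pass layer count is used directly.  The loop is an
 * open-coded equivalent of Mesa's util_last_bit(). */
static uint32_t
required_array_len(uint32_t view_mask, uint32_t layer_count)
{
   if (view_mask == 0)
      return layer_count;

   uint32_t last_bit = 0;
   while (view_mask) {
      last_bit++;
      view_mask >>= 1;
   }
   return last_bit;
}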
-/**
- * This ANDs the view mask of the current subpass with the pending clear
- * views in the attachment to get the mask of views active in the subpass
- * that still need to be cleared.
- */
-static inline uint32_t
-get_multiview_subpass_clear_mask(const struct anv_cmd_state *cmd_state,
- const struct anv_attachment_state *att_state)
-{
- return cmd_state->subpass->view_mask & att_state->pending_clear_views;
-}
-
-static inline bool
-do_first_layer_clear(const struct anv_cmd_state *cmd_state,
- const struct anv_attachment_state *att_state)
+static void
+cmd_buffer_emit_cps_control_buffer(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image_view *fsr_iview)
{
- if (!cmd_state->subpass->view_mask)
- return true;
+#if GFX_VERx10 >= 125
+ struct anv_device *device = cmd_buffer->device;
- uint32_t pending_clear_mask =
- get_multiview_subpass_clear_mask(cmd_state, att_state);
+ if (!device->vk.enabled_extensions.KHR_fragment_shading_rate)
+ return;
- return pending_clear_mask & 1;
-}
+ uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
+ device->isl_dev.cpb.size / 4);
+ if (dw == NULL)
+ return;
-static inline bool
-current_subpass_is_last_for_attachment(const struct anv_cmd_state *cmd_state,
- uint32_t att_idx)
-{
- const uint32_t last_subpass_idx =
- cmd_state->pass->attachments[att_idx].last_subpass_idx;
- const struct anv_subpass *last_subpass =
- &cmd_state->pass->subpasses[last_subpass_idx];
- return last_subpass == cmd_state->subpass;
-}
+ struct isl_cpb_emit_info info = { };
-static void
-cmd_buffer_begin_subpass(struct anv_cmd_buffer *cmd_buffer,
- uint32_t subpass_id)
-{
- struct anv_cmd_state *cmd_state = &cmd_buffer->state;
- struct anv_render_pass *pass = cmd_state->pass;
- struct anv_subpass *subpass = &pass->subpasses[subpass_id];
- cmd_state->subpass = subpass;
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
-
- /* Our implementation of VK_KHR_multiview uses instancing to draw the
- * different views. If the client asks for instancing, we need to use the
- * Instance Data Step Rate to ensure that we repeat the client's
- * per-instance data once for each view. Since this bit is in
- * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
- * of each subpass.
- */
- if (GFX_VER == 7)
- cmd_buffer->state.gfx.vb_dirty |= ~0;
+ if (fsr_iview) {
+ const struct anv_image_binding *binding = &fsr_iview->image->bindings[0];
- /* It is possible to start a render pass with an old pipeline. Because the
- * render pass and subpass index are both baked into the pipeline, this is
- * highly unlikely. In order to do so, it requires that you have a render
- * pass with a single subpass and that you use that render pass twice
- * back-to-back and use the same pipeline at the start of the second render
- * pass as at the end of the first. In order to avoid unpredictable issues
- * with this edge case, we just dirty the pipeline at the start of every
- * subpass.
- */
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs, binding->address.bo);
- /* Accumulate any subpass flushes that need to happen before the subpass */
- anv_add_pending_pipe_bits(cmd_buffer,
- cmd_buffer->state.pass->subpass_flushes[subpass_id],
- "begin subpass deps/attachments");
+ struct anv_address addr =
+ anv_address_add(binding->address, binding->memory_range.offset);
- VkRect2D render_area = cmd_buffer->state.render_area;
- struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+ info.view = &fsr_iview->planes[0].isl;
+ info.surf = &fsr_iview->image->planes[0].primary_surface.isl;
+ info.address = anv_address_physical(addr);
+ info.mocs =
+ anv_mocs(device, fsr_iview->image->bindings[0].address.bo,
+ ISL_SURF_USAGE_CPB_BIT);
+ }
- bool is_multiview = subpass->view_mask != 0;
+ isl_emit_cpb_control_s(&device->isl_dev, dw, &info);
- for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
- const uint32_t a = subpass->attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
+ /* Wa_14016712196:
+ * Emit depth flush after state that sends implicit depth flush.
+ */
+ if (intel_needs_workaround(cmd_buffer->device->info, 14016712196)) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT);
+ }
+#endif /* GFX_VERx10 >= 125 */
+}
- assert(a < cmd_state->pass->attachment_count);
- struct anv_attachment_state *att_state = &cmd_state->attachments[a];
+static VkImageLayout
+attachment_initial_layout(const VkRenderingAttachmentInfo *att)
+{
+ const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
+ vk_find_struct_const(att->pNext,
+ RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
+ if (layout_info != NULL)
+ return layout_info->initialLayout;
- struct anv_image_view *iview = cmd_state->attachments[a].image_view;
- const struct anv_image *image = iview->image;
+ return att->imageLayout;
+}
- VkImageLayout target_layout = subpass->attachments[i].layout;
- VkImageLayout target_stencil_layout =
- subpass->attachments[i].stencil_layout;
+void genX(CmdBeginRendering)(
+ VkCommandBuffer commandBuffer,
+ const VkRenderingInfo* pRenderingInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ VkResult result;
- uint32_t level = iview->planes[0].isl.base_level;
- uint32_t width = anv_minify(iview->image->vk.extent.width, level);
- uint32_t height = anv_minify(iview->image->vk.extent.height, level);
- bool full_surface_draw =
- render_area.offset.x == 0 && render_area.offset.y == 0 &&
- render_area.extent.width == width &&
- render_area.extent.height == height;
+ if (!anv_cmd_buffer_is_render_queue(cmd_buffer)) {
+ assert(!"Trying to start a render pass on non-render queue!");
+ anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
+ return;
+ }
- uint32_t base_layer, layer_count;
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- base_layer = 0;
- layer_count = anv_minify(iview->image->vk.extent.depth, level);
- } else {
- base_layer = iview->planes[0].isl.base_array_layer;
- layer_count = fb->layers;
- }
+ anv_measure_beginrenderpass(cmd_buffer);
+ trace_intel_begin_render_pass(&cmd_buffer->trace);
- if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- bool will_full_fast_clear =
- (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) &&
- att_state->fast_clear && full_surface_draw;
-
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
- level, 1, base_layer, layer_count,
- att_state->current_layout, target_layout,
- VK_QUEUE_FAMILY_IGNORED,
- VK_QUEUE_FAMILY_IGNORED,
- will_full_fast_clear);
- att_state->aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
- target_layout);
- }
+ gfx->rendering_flags = pRenderingInfo->flags;
+ gfx->view_mask = pRenderingInfo->viewMask;
+ gfx->layer_count = pRenderingInfo->layerCount;
+ gfx->samples = 0;
- if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- bool will_full_fast_clear =
- (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
- att_state->fast_clear && full_surface_draw;
+ if (gfx->render_area.offset.x != pRenderingInfo->renderArea.offset.x ||
+ gfx->render_area.offset.y != pRenderingInfo->renderArea.offset.y ||
+ gfx->render_area.extent.width != pRenderingInfo->renderArea.extent.width ||
+ gfx->render_area.extent.height != pRenderingInfo->renderArea.extent.height) {
+ gfx->render_area = pRenderingInfo->renderArea;
+ gfx->dirty |= ANV_CMD_DIRTY_RENDER_AREA;
+ }
- transition_depth_buffer(cmd_buffer, image,
- base_layer, layer_count,
- att_state->current_layout, target_layout,
- will_full_fast_clear);
- att_state->aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
- target_layout);
- }
+ const bool is_multiview = gfx->view_mask != 0;
+ const VkRect2D render_area = gfx->render_area;
+ const uint32_t layers =
+ is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
- if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- bool will_full_fast_clear =
- (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- att_state->fast_clear && full_surface_draw;
+ /* The framebuffer size is at least large enough to contain the render
+ * area. Because a zero renderArea is possible, we MAX with 1.
+ */
+ struct isl_extent3d fb_size = {
+ .w = MAX2(1, render_area.offset.x + render_area.extent.width),
+ .h = MAX2(1, render_area.offset.y + render_area.extent.height),
+ .d = layers,
+ };
- transition_stencil_buffer(cmd_buffer, image,
- level, 1, base_layer, layer_count,
- att_state->current_stencil_layout,
- target_stencil_layout,
- will_full_fast_clear);
- }
- att_state->current_layout = target_layout;
- att_state->current_stencil_layout = target_stencil_layout;
+ const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
+ result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
+ if (result != VK_SUCCESS)
+ return;
- if (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
- assert(att_state->pending_clear_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
+ genX(flush_pipeline_select_3d)(cmd_buffer);
- /* Multi-planar images are not supported as attachments */
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- assert(image->n_planes == 1);
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
+ continue;
- uint32_t base_clear_layer = iview->planes[0].isl.base_array_layer;
- uint32_t clear_layer_count = fb->layers;
+ const VkRenderingAttachmentInfo *att =
+ &pRenderingInfo->pColorAttachments[i];
+ ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
+ const VkImageLayout initial_layout = attachment_initial_layout(att);
+
+ assert(render_area.offset.x + render_area.extent.width <=
+ iview->vk.extent.width);
+ assert(render_area.offset.y + render_area.extent.height <=
+ iview->vk.extent.height);
+ assert(layers <= iview->vk.layer_count);
+
+ fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
+ fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
+
+ assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
+ gfx->samples |= iview->vk.image->samples;
+
+ enum isl_aux_usage aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+ att->imageLayout,
+ cmd_buffer->queue_family->queueFlags);
+
+ union isl_color_value fast_clear_color = { .u32 = { 0, } };
+
+ if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
+ !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
+ const union isl_color_value clear_color =
+ vk_to_isl_color_with_format(att->clearValue.color,
+ iview->planes[0].isl.format);
+
+ /* We only support fast-clears on the first layer */
+ const bool fast_clear =
+ (!is_multiview || (gfx->view_mask & 1)) &&
+ anv_can_fast_clear_color_view(cmd_buffer->device, iview,
+ att->imageLayout, clear_color,
+ layers, render_area,
+ cmd_buffer->queue_family->queueFlags);
+
+ if (att->imageLayout != initial_layout) {
+ assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
+ render_area.extent.width == iview->vk.extent.width &&
+ render_area.extent.height == iview->vk.extent.height);
+ if (is_multiview) {
+ u_foreach_bit(view, gfx->view_mask) {
+ transition_color_buffer(cmd_buffer, iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ iview->vk.base_mip_level, 1,
+ iview->vk.base_array_layer + view,
+ 1, /* layer_count */
+ initial_layout, att->imageLayout,
+ VK_QUEUE_FAMILY_IGNORED,
+ VK_QUEUE_FAMILY_IGNORED,
+ fast_clear);
+ }
+ } else {
+ transition_color_buffer(cmd_buffer, iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ iview->vk.base_mip_level, 1,
+ iview->vk.base_array_layer,
+ gfx->layer_count,
+ initial_layout, att->imageLayout,
+ VK_QUEUE_FAMILY_IGNORED,
+ VK_QUEUE_FAMILY_IGNORED,
+ fast_clear);
+ }
+ }
- if (att_state->fast_clear &&
- do_first_layer_clear(cmd_state, att_state)) {
+ uint32_t clear_view_mask = pRenderingInfo->viewMask;
+ uint32_t base_clear_layer = iview->vk.base_array_layer;
+ uint32_t clear_layer_count = gfx->layer_count;
+ if (fast_clear) {
/* We only support fast-clears on the first layer */
- assert(level == 0 && base_layer == 0);
+ assert(iview->vk.base_mip_level == 0 &&
+ iview->vk.base_array_layer == 0);
+
+ fast_clear_color = clear_color;
- union isl_color_value clear_color = {};
- anv_clear_color_from_att_state(&clear_color, att_state, iview);
if (iview->image->vk.samples == 1) {
- anv_image_ccs_op(cmd_buffer, image,
+ anv_image_ccs_op(cmd_buffer, iview->image,
iview->planes[0].isl.format,
iview->planes[0].isl.swizzle,
VK_IMAGE_ASPECT_COLOR_BIT,
0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
- &clear_color,
+ &fast_clear_color,
false);
} else {
- anv_image_mcs_op(cmd_buffer, image,
+ anv_image_mcs_op(cmd_buffer, iview->image,
iview->planes[0].isl.format,
iview->planes[0].isl.swizzle,
VK_IMAGE_ASPECT_COLOR_BIT,
0, 1, ISL_AUX_OP_FAST_CLEAR,
- &clear_color,
+ &fast_clear_color,
false);
}
+ clear_view_mask &= ~1u;
base_clear_layer++;
clear_layer_count--;
- if (is_multiview)
- att_state->pending_clear_views &= ~1;
-
- if (isl_color_value_is_zero(clear_color,
- iview->planes[0].isl.format)) {
- /* This image has the auxiliary buffer enabled. We can mark the
- * subresource as not needing a resolve because the clear color
- * will match what's in every RENDER_SURFACE_STATE object when
- * it's being used for sampling.
- */
- set_image_fast_clear_state(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- ANV_FAST_CLEAR_DEFAULT_VALUE);
- } else {
- set_image_fast_clear_state(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- ANV_FAST_CLEAR_ANY);
- }
+
+ genX(set_fast_clear_state)(cmd_buffer, iview->image,
+ iview->planes[0].isl.format,
+ clear_color);
}
- /* From the VkFramebufferCreateInfo spec:
- *
- * "If the render pass uses multiview, then layers must be one and each
- * attachment requires a number of layers that is greater than the
- * maximum bit index set in the view mask in the subpasses in which it
- * is used."
- *
- * So if multiview is active we ignore the number of layers in the
- * framebuffer and instead we honor the view mask from the subpass.
- */
if (is_multiview) {
- assert(image->n_planes == 1);
- uint32_t pending_clear_mask =
- get_multiview_subpass_clear_mask(cmd_state, att_state);
-
- u_foreach_bit(layer_idx, pending_clear_mask) {
- uint32_t layer =
- iview->planes[0].isl.base_array_layer + layer_idx;
-
- anv_image_clear_color(cmd_buffer, image,
+ u_foreach_bit(view, clear_view_mask) {
+ anv_image_clear_color(cmd_buffer, iview->image,
VK_IMAGE_ASPECT_COLOR_BIT,
- att_state->aux_usage,
+ aux_usage,
iview->planes[0].isl.format,
iview->planes[0].isl.swizzle,
- level, layer, 1,
- render_area,
- vk_to_isl_color(att_state->clear_value.color));
+ iview->vk.base_mip_level,
+ iview->vk.base_array_layer + view, 1,
+ render_area, clear_color);
}
-
- att_state->pending_clear_views &= ~pending_clear_mask;
- } else if (clear_layer_count > 0) {
- assert(image->n_planes == 1);
- anv_image_clear_color(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
- att_state->aux_usage,
+ } else {
+ anv_image_clear_color(cmd_buffer, iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ aux_usage,
iview->planes[0].isl.format,
iview->planes[0].isl.swizzle,
- level, base_clear_layer, clear_layer_count,
- render_area,
- vk_to_isl_color(att_state->clear_value.color));
+ iview->vk.base_mip_level,
+ base_clear_layer, clear_layer_count,
+ render_area, clear_color);
+ }
+ } else {
+ /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
+ assert(att->imageLayout == initial_layout);
+ }
+
+ gfx->color_att[i].vk_format = iview->vk.format;
+ gfx->color_att[i].iview = iview;
+ gfx->color_att[i].layout = att->imageLayout;
+ gfx->color_att[i].aux_usage = aux_usage;
+
+ struct isl_view isl_view = iview->planes[0].isl;
+ if (pRenderingInfo->viewMask) {
+ assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
+ isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
+ } else {
+ assert(isl_view.array_len >= pRenderingInfo->layerCount);
+ isl_view.array_len = pRenderingInfo->layerCount;
+ }
+
+ anv_image_fill_surface_state(cmd_buffer->device,
+ iview->image,
+ VK_IMAGE_ASPECT_COLOR_BIT,
+ &isl_view,
+ ISL_SURF_USAGE_RENDER_TARGET_BIT,
+ aux_usage, &fast_clear_color,
+ 0, /* anv_image_view_state_flags */
+ &gfx->color_att[i].surface_state);
+
+ add_surface_state_relocs(cmd_buffer, &gfx->color_att[i].surface_state);
+
+ if (GFX_VER < 10 &&
+ (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
+ (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
+ iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
+ iview->planes[0].isl.base_level == 0 &&
+ iview->planes[0].isl.base_array_layer == 0) {
+ genX(load_image_clear_color)(cmd_buffer,
+ gfx->color_att[i].surface_state.state,
+ iview->image);
+ }
+
+ if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
+ gfx->color_att[i].resolve_mode = att->resolveMode;
+ gfx->color_att[i].resolve_iview =
+ anv_image_view_from_handle(att->resolveImageView);
+ gfx->color_att[i].resolve_layout = att->resolveImageLayout;
+ }
+ }
+
+ anv_cmd_graphic_state_update_has_uint_rt(gfx);
+
+ const struct anv_image_view *fsr_iview = NULL;
+ const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_att =
+ vk_find_struct_const(pRenderingInfo->pNext,
+ RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
+ if (fsr_att != NULL && fsr_att->imageView != VK_NULL_HANDLE) {
+ fsr_iview = anv_image_view_from_handle(fsr_att->imageView);
+ /* imageLayout and shadingRateAttachmentTexelSize are ignored */
+ }
+
+ const struct anv_image_view *ds_iview = NULL;
+ const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
+ const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
+ if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
+ (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
+ const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
+ VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+ enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
+ enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
+ float depth_clear_value = 0;
+ uint32_t stencil_clear_value = 0;
+
+ if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
+ d_iview = anv_image_view_from_handle(d_att->imageView);
+ initial_depth_layout = attachment_initial_layout(d_att);
+ depth_layout = d_att->imageLayout;
+ depth_aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ d_iview->image,
+ VK_IMAGE_ASPECT_DEPTH_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ depth_layout,
+ cmd_buffer->queue_family->queueFlags);
+ depth_clear_value = d_att->clearValue.depthStencil.depth;
+ }
+
+ if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
+ s_iview = anv_image_view_from_handle(s_att->imageView);
+ initial_stencil_layout = attachment_initial_layout(s_att);
+ stencil_layout = s_att->imageLayout;
+ stencil_aux_usage =
+ anv_layout_to_aux_usage(cmd_buffer->device->info,
+ s_iview->image,
+ VK_IMAGE_ASPECT_STENCIL_BIT,
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ stencil_layout,
+ cmd_buffer->queue_family->queueFlags);
+ stencil_clear_value = s_att->clearValue.depthStencil.stencil;
+ }
+
+ assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
+ ds_iview = d_iview != NULL ? d_iview : s_iview;
+ assert(ds_iview != NULL);
+
+ assert(render_area.offset.x + render_area.extent.width <=
+ ds_iview->vk.extent.width);
+ assert(render_area.offset.y + render_area.extent.height <=
+ ds_iview->vk.extent.height);
+ assert(layers <= ds_iview->vk.layer_count);
+
+ fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
+ fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
+
+ assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
+ gfx->samples |= ds_iview->vk.image->samples;
+
+ VkImageAspectFlags clear_aspects = 0;
+ if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
+ !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
+ clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
+ if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
+ !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
+ clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
+
+ if (clear_aspects != 0) {
+ const bool hiz_clear =
+ anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
+ depth_layout, clear_aspects,
+ depth_clear_value,
+ render_area,
+ cmd_buffer->queue_family->queueFlags);
+
+ if (depth_layout != initial_depth_layout) {
+ assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
+ render_area.extent.width == d_iview->vk.extent.width &&
+ render_area.extent.height == d_iview->vk.extent.height);
+
+ if (is_multiview) {
+ u_foreach_bit(view, gfx->view_mask) {
+ transition_depth_buffer(cmd_buffer, d_iview->image,
+ d_iview->vk.base_mip_level, 1,
+ d_iview->vk.base_array_layer + view,
+ 1 /* layer_count */,
+ initial_depth_layout, depth_layout,
+ hiz_clear);
+ }
+ } else {
+ transition_depth_buffer(cmd_buffer, d_iview->image,
+ d_iview->vk.base_mip_level, 1,
+ d_iview->vk.base_array_layer,
+ gfx->layer_count,
+ initial_depth_layout, depth_layout,
+ hiz_clear);
+ }
}
- } else if (att_state->pending_clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
- VK_IMAGE_ASPECT_STENCIL_BIT)) {
- if (att_state->fast_clear &&
- (att_state->pending_clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
- /* We currently only support HiZ for single-LOD images */
- assert(isl_aux_usage_has_hiz(iview->image->planes[0].aux_usage));
- assert(iview->planes[0].isl.base_level == 0);
- assert(iview->planes[0].isl.levels == 1);
+
+ if (stencil_layout != initial_stencil_layout) {
+ assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
+ render_area.extent.width == s_iview->vk.extent.width &&
+ render_area.extent.height == s_iview->vk.extent.height);
+
+ if (is_multiview) {
+ u_foreach_bit(view, gfx->view_mask) {
+ transition_stencil_buffer(cmd_buffer, s_iview->image,
+ s_iview->vk.base_mip_level, 1,
+ s_iview->vk.base_array_layer + view,
+ 1 /* layer_count */,
+ initial_stencil_layout,
+ stencil_layout,
+ hiz_clear);
+ }
+ } else {
+ transition_stencil_buffer(cmd_buffer, s_iview->image,
+ s_iview->vk.base_mip_level, 1,
+ s_iview->vk.base_array_layer,
+ gfx->layer_count,
+ initial_stencil_layout,
+ stencil_layout,
+ hiz_clear);
+ }
}
if (is_multiview) {
- uint32_t pending_clear_mask =
- get_multiview_subpass_clear_mask(cmd_state, att_state);
-
- u_foreach_bit(layer_idx, pending_clear_mask) {
- uint32_t layer =
- iview->planes[0].isl.base_array_layer + layer_idx;
-
- if (att_state->fast_clear) {
- anv_image_hiz_clear(cmd_buffer, image,
- att_state->pending_clear_aspects,
- level, layer, 1, render_area,
- att_state->clear_value.depthStencil.stencil);
+ u_foreach_bit(view, gfx->view_mask) {
+ uint32_t level = ds_iview->vk.base_mip_level;
+ uint32_t layer = ds_iview->vk.base_array_layer + view;
+
+ if (hiz_clear) {
+ anv_image_hiz_clear(cmd_buffer, ds_iview->image,
+ clear_aspects,
+ level, layer, 1,
+ render_area,
+ stencil_clear_value);
} else {
- anv_image_clear_depth_stencil(cmd_buffer, image,
- att_state->pending_clear_aspects,
- att_state->aux_usage,
- level, layer, 1, render_area,
- att_state->clear_value.depthStencil.depth,
- att_state->clear_value.depthStencil.stencil);
+ anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
+ clear_aspects,
+ depth_aux_usage,
+ level, layer, 1,
+ render_area,
+ depth_clear_value,
+ stencil_clear_value);
}
}
-
- att_state->pending_clear_views &= ~pending_clear_mask;
} else {
- if (att_state->fast_clear) {
- anv_image_hiz_clear(cmd_buffer, image,
- att_state->pending_clear_aspects,
+ uint32_t level = ds_iview->vk.base_mip_level;
+ uint32_t base_layer = ds_iview->vk.base_array_layer;
+ uint32_t layer_count = gfx->layer_count;
+
+ if (hiz_clear) {
+ anv_image_hiz_clear(cmd_buffer, ds_iview->image,
+ clear_aspects,
level, base_layer, layer_count,
render_area,
- att_state->clear_value.depthStencil.stencil);
+ stencil_clear_value);
} else {
- anv_image_clear_depth_stencil(cmd_buffer, image,
- att_state->pending_clear_aspects,
- att_state->aux_usage,
+ anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
+ clear_aspects,
+ depth_aux_usage,
level, base_layer, layer_count,
render_area,
- att_state->clear_value.depthStencil.depth,
- att_state->clear_value.depthStencil.stencil);
+ depth_clear_value,
+ stencil_clear_value);
}
}
- } else {
- assert(att_state->pending_clear_aspects == 0);
+ } else {
+ /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
+ assert(depth_layout == initial_depth_layout);
+ assert(stencil_layout == initial_stencil_layout);
}
- /* If multiview is enabled, then we are only done clearing when we no
- * longer have pending layers to clear, or when we have processed the
- * last subpass that uses this attachment.
- */
- if (!is_multiview ||
- att_state->pending_clear_views == 0 ||
- current_subpass_is_last_for_attachment(cmd_state, a)) {
- att_state->pending_clear_aspects = 0;
+ if (d_iview != NULL) {
+ gfx->depth_att.vk_format = d_iview->vk.format;
+ gfx->depth_att.iview = d_iview;
+ gfx->depth_att.layout = depth_layout;
+ gfx->depth_att.aux_usage = depth_aux_usage;
+ if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
+ assert(d_att->resolveImageView != VK_NULL_HANDLE);
+ gfx->depth_att.resolve_mode = d_att->resolveMode;
+ gfx->depth_att.resolve_iview =
+ anv_image_view_from_handle(d_att->resolveImageView);
+ gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
+ }
}
- att_state->pending_load_aspects = 0;
+ if (s_iview != NULL) {
+ gfx->stencil_att.vk_format = s_iview->vk.format;
+ gfx->stencil_att.iview = s_iview;
+ gfx->stencil_att.layout = stencil_layout;
+ gfx->stencil_att.aux_usage = stencil_aux_usage;
+ if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
+ assert(s_att->resolveImageView != VK_NULL_HANDLE);
+ gfx->stencil_att.resolve_mode = s_att->resolveMode;
+ gfx->stencil_att.resolve_iview =
+ anv_image_view_from_handle(s_att->resolveImageView);
+ gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
+ }
+ }
}
- /* We've transitioned all our images possibly fast clearing them. Now we
- * can fill out the surface states that we will use as render targets
- * during actual subpass rendering.
- */
- VkResult result = genX(cmd_buffer_alloc_att_surf_states)(cmd_buffer,
- pass, subpass);
- if (result != VK_SUCCESS)
- return;
-
+ /* Finally, now that we know the right size, set up the null surface */
+ assert(util_bitcount(gfx->samples) <= 1);
isl_null_fill_state(&cmd_buffer->device->isl_dev,
- cmd_state->null_surface_state.map,
- .size = isl_extent3d(fb->width, fb->height, fb->layers));
+ gfx->null_surface_state.map,
+ .size = fb_size);
- for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
- const uint32_t att = subpass->attachments[i].attachment;
- if (att == VK_ATTACHMENT_UNUSED)
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
continue;
- assert(att < cmd_state->pass->attachment_count);
- struct anv_render_pass_attachment *pass_att = &pass->attachments[att];
- struct anv_attachment_state *att_state = &cmd_state->attachments[att];
- struct anv_image_view *iview = att_state->image_view;
-
- if (!vk_format_is_color(pass_att->format))
- continue;
-
- const VkImageUsageFlagBits att_usage = subpass->attachments[i].usage;
- assert(util_bitcount(att_usage) == 1);
-
- struct anv_surface_state *surface_state;
- isl_surf_usage_flags_t isl_surf_usage;
- enum isl_aux_usage isl_aux_usage;
- if (att_usage == VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
- surface_state = &att_state->color;
- isl_surf_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
- isl_aux_usage = att_state->aux_usage;
- } else if (att_usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
- surface_state = &att_state->input;
- isl_surf_usage = ISL_SURF_USAGE_TEXTURE_BIT;
- isl_aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
- att_state->current_layout);
- } else {
- continue;
- }
-
- /* We had better have a surface state when we get here */
- assert(surface_state->state.map);
+ isl_null_fill_state(&cmd_buffer->device->isl_dev,
+ gfx->color_att[i].surface_state.state.map,
+ .size = fb_size);
+ }
- union isl_color_value clear_color = { .u32 = { 0, } };
- if (pass_att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR &&
- att_state->fast_clear)
- anv_clear_color_from_att_state(&clear_color, att_state, iview);
+ /****** We can now start emitting code to begin the render pass ******/
- anv_image_fill_surface_state(cmd_buffer->device,
- iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- &iview->planes[0].isl,
- isl_surf_usage,
- isl_aux_usage,
- &clear_color,
- 0,
- surface_state,
- NULL);
+ gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
- add_surface_state_relocs(cmd_buffer, *surface_state);
+ /* It is possible to start a render pass with an old pipeline. Because the
+ * render pass and subpass index are both baked into the pipeline, this is
+ * highly unlikely. Doing so requires a render pass with a single subpass,
+ * used twice back-to-back, with the same pipeline bound at the end of the
+ * first render pass and at the start of the second. To avoid unpredictable
+ * issues with this edge case, we just dirty the pipeline at the start of
+ * every subpass.
+ */
+ gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
- if (GFX_VER < 10 &&
- pass_att->load_op == VK_ATTACHMENT_LOAD_OP_LOAD &&
- iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
- iview->planes[0].isl.base_level == 0 &&
- iview->planes[0].isl.base_array_layer == 0) {
- genX(copy_fast_clear_dwords)(cmd_buffer, surface_state->state,
- iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- false /* copy to ss */);
+#if GFX_VER >= 11
+ bool has_color_att = false;
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE) {
+ has_color_att = true;
+ break;
}
}
-
-#if GFX_VER >= 11
- /* The PIPE_CONTROL command description says:
- *
- * "Whenever a Binding Table Index (BTI) used by a Render Taget Message
- * points to a different RENDER_SURFACE_STATE, SW must issue a Render
- * Target Cache Flush by enabling this bit. When render target flush
- * is set due to new association of BTI, PS Scoreboard Stall bit must
- * be set in this packet."
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
- ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
- "change RT");
+ if (has_color_att) {
+ /* The PIPE_CONTROL command description says:
+ *
+ * "Whenever a Binding Table Index (BTI) used by a Render Target Message
+ * points to a different RENDER_SURFACE_STATE, SW must issue a Render
+ * Target Cache Flush by enabling this bit. When render target flush
+ * is set due to new association of BTI, PS Scoreboard Stall bit must
+ * be set in this packet."
+ *
+ * We assume that a new BeginRendering is always changing the RTs, which
+ * may not be true and may cause excessive flushing. We can trivially skip
+ * the flush when there are no RTs at all (depth-only rendering), though.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
+ "change RT");
+ }
#endif
cmd_buffer_emit_depth_stencil(cmd_buffer);
-}
-static enum blorp_filter
-vk_to_blorp_resolve_mode(VkResolveModeFlagBitsKHR vk_mode)
-{
- switch (vk_mode) {
- case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT_KHR:
- return BLORP_FILTER_SAMPLE_0;
- case VK_RESOLVE_MODE_AVERAGE_BIT_KHR:
- return BLORP_FILTER_AVERAGE;
- case VK_RESOLVE_MODE_MIN_BIT_KHR:
- return BLORP_FILTER_MIN_SAMPLE;
- case VK_RESOLVE_MODE_MAX_BIT_KHR:
- return BLORP_FILTER_MAX_SAMPLE;
- default:
- return BLORP_FILTER_NONE;
- }
+ cmd_buffer_emit_cps_control_buffer(cmd_buffer, fsr_iview);
}
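
For reference, a minimal application-side sketch of the call this path services: a depth-only dynamic-rendering begin with a cleared, resolved depth attachment, which is what populates d_att and the resolve fields consumed above. This is illustrative only and not part of the patch; the handle names (cmdbuf, depth_view, resolve_view) are placeholders.

#include <vulkan/vulkan.h>

static void
begin_depth_only_rendering(VkCommandBuffer cmdbuf,
                           VkImageView depth_view,   /* placeholder handle */
                           VkImageView resolve_view, /* placeholder handle */
                           VkExtent2D extent)
{
   /* Cleared, multisampled depth attachment resolved at the end of the pass. */
   const VkRenderingAttachmentInfo depth_att = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
      .imageView = depth_view,
      .imageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
      .resolveMode = VK_RESOLVE_MODE_SAMPLE_ZERO_BIT,
      .resolveImageView = resolve_view,
      .resolveImageLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
      .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
      .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
      .clearValue = { .depthStencil = { .depth = 1.0f, .stencil = 0 } },
   };
   const VkRenderingInfo info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
      .renderArea = { .offset = { 0, 0 }, .extent = extent },
      .layerCount = 1,
      .colorAttachmentCount = 0, /* depth-only: no RT cache flush needed */
      .pDepthAttachment = &depth_att,
   };
   vkCmdBeginRendering(cmdbuf, &info);
   /* ... draws ... */
   vkCmdEndRendering(cmdbuf);
}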
static void
-cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
+cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_attachment *att,
+ VkImageAspectFlagBits aspect)
{
- struct anv_cmd_state *cmd_state = &cmd_buffer->state;
- struct anv_subpass *subpass = cmd_state->subpass;
- uint32_t subpass_id = anv_get_subpass_id(&cmd_buffer->state);
- struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const struct anv_image_view *iview = att->iview;
- /* We are done with the previous subpass and all rendering directly to that
- * subpass is now complete. Zero out all the surface states so we don't
- * accidentally use them between now and the next subpass.
- */
- for (uint32_t i = 0; i < cmd_state->pass->attachment_count; ++i) {
- memset(&cmd_state->attachments[i].color, 0,
- sizeof(cmd_state->attachments[i].color));
- memset(&cmd_state->attachments[i].input, 0,
- sizeof(cmd_state->attachments[i].input));
- }
- cmd_state->null_surface_state = ANV_STATE_NULL;
- cmd_state->attachment_states = ANV_STATE_NULL;
-
- for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
- const uint32_t a = subpass->attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
- continue;
+ if (iview == NULL)
+ return;
- assert(a < cmd_state->pass->attachment_count);
- struct anv_attachment_state *att_state = &cmd_state->attachments[a];
- struct anv_image_view *iview = att_state->image_view;
+ if (gfx->view_mask == 0) {
+ genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
+ aspect, att->aux_usage,
+ iview->planes[0].isl.base_level,
+ iview->planes[0].isl.base_array_layer,
+ gfx->layer_count);
+ } else {
+ uint32_t res_view_mask = gfx->view_mask;
+ while (res_view_mask) {
+ int i = u_bit_scan(&res_view_mask);
+
+ const uint32_t level = iview->planes[0].isl.base_level;
+ const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
- assert(util_bitcount(subpass->attachments[i].usage) == 1);
- if (subpass->attachments[i].usage ==
- VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
- /* We assume that if we're ending a subpass, we did do some rendering
- * so we may end up with compressed data.
- */
genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_COLOR_BIT,
- att_state->aux_usage,
- iview->planes[0].isl.base_level,
- iview->planes[0].isl.base_array_layer,
- fb->layers);
- } else if (subpass->attachments[i].usage ==
- VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
- /* We may be writing depth or stencil so we need to mark the surface.
- * Unfortunately, there's no way to know at this point whether the
- * depth or stencil tests used will actually write to the surface.
- *
- * Even though stencil may be plane 1, it always shares a base_level
- * with depth.
- */
- const struct isl_view *ds_view = &iview->planes[0].isl;
- if (iview->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- att_state->aux_usage,
- ds_view->base_level,
- ds_view->base_array_layer,
- fb->layers);
- }
- if (iview->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- /* Even though stencil may be plane 1, it always shares a
- * base_level with depth.
- */
- genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- ISL_AUX_USAGE_NONE,
- ds_view->base_level,
- ds_view->base_array_layer,
- fb->layers);
- }
+ aspect, att->aux_usage,
+ level, layer, 1);
}
}
+}
+
+void genX(CmdEndRendering)(
+ VkCommandBuffer commandBuffer)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ const bool is_multiview = gfx->view_mask != 0;
+ const uint32_t layers =
+ is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
+
+ bool has_color_resolve = false;
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
+ VK_IMAGE_ASPECT_COLOR_BIT);
- if (subpass->has_color_resolve) {
+ /* Stash this off for later */
+ if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
+ !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
+ has_color_resolve = true;
+ }
+
+ cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
+ VK_IMAGE_ASPECT_DEPTH_BIT);
+
+ cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
+ VK_IMAGE_ASPECT_STENCIL_BIT);
+
+ if (has_color_resolve) {
/* We are about to do some MSAA resolves. We need to flush so that the
* result of writes to the MSAA color attachments show up in the sampler
* when we blit to the single-sampled resolve target.
@@ -6418,58 +5378,11 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
"MSAA resolve");
-
- for (uint32_t i = 0; i < subpass->color_count; ++i) {
- uint32_t src_att = subpass->color_attachments[i].attachment;
- uint32_t dst_att = subpass->resolve_attachments[i].attachment;
-
- if (dst_att == VK_ATTACHMENT_UNUSED)
- continue;
-
- assert(src_att < cmd_buffer->state.pass->attachment_count);
- assert(dst_att < cmd_buffer->state.pass->attachment_count);
-
- if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
- /* From the Vulkan 1.0 spec:
- *
- * If the first use of an attachment in a render pass is as a
- * resolve attachment, then the loadOp is effectively ignored
- * as the resolve is guaranteed to overwrite all pixels in the
- * render area.
- */
- cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
- }
-
- struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
- struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;
-
- const VkRect2D render_area = cmd_buffer->state.render_area;
-
- enum isl_aux_usage src_aux_usage =
- cmd_buffer->state.attachments[src_att].aux_usage;
- enum isl_aux_usage dst_aux_usage =
- cmd_buffer->state.attachments[dst_att].aux_usage;
-
- assert(src_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT &&
- dst_iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
-
- anv_image_msaa_resolve(cmd_buffer,
- src_iview->image, src_aux_usage,
- src_iview->planes[0].isl.base_level,
- src_iview->planes[0].isl.base_array_layer,
- dst_iview->image, dst_aux_usage,
- dst_iview->planes[0].isl.base_level,
- dst_iview->planes[0].isl.base_array_layer,
- VK_IMAGE_ASPECT_COLOR_BIT,
- render_area.offset.x, render_area.offset.y,
- render_area.offset.x, render_area.offset.y,
- render_area.extent.width,
- render_area.extent.height,
- fb->layers, BLORP_FILTER_NONE);
- }
}
- if (subpass->ds_resolve_attachment) {
+ if (!(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT) &&
+ (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
+ gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)) {
/* We are about to do some MSAA resolves. We need to flush so that the
* result of writes to the MSAA depth attachments show up in the sampler
* when we blit to the single-sampled resolve target.
@@ -6478,313 +5391,71 @@ cmd_buffer_end_subpass(struct anv_cmd_buffer *cmd_buffer)
ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
"MSAA resolve");
-
- uint32_t src_att = subpass->depth_stencil_attachment->attachment;
- uint32_t dst_att = subpass->ds_resolve_attachment->attachment;
-
- assert(src_att < cmd_buffer->state.pass->attachment_count);
- assert(dst_att < cmd_buffer->state.pass->attachment_count);
-
- if (cmd_buffer->state.attachments[dst_att].pending_clear_aspects) {
- /* From the Vulkan 1.0 spec:
- *
- * If the first use of an attachment in a render pass is as a
- * resolve attachment, then the loadOp is effectively ignored
- * as the resolve is guaranteed to overwrite all pixels in the
- * render area.
- */
- cmd_buffer->state.attachments[dst_att].pending_clear_aspects = 0;
- }
-
- struct anv_image_view *src_iview = cmd_state->attachments[src_att].image_view;
- struct anv_image_view *dst_iview = cmd_state->attachments[dst_att].image_view;
-
- const VkRect2D render_area = cmd_buffer->state.render_area;
-
- struct anv_attachment_state *src_state =
- &cmd_state->attachments[src_att];
- struct anv_attachment_state *dst_state =
- &cmd_state->attachments[dst_att];
-
- if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
- subpass->depth_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
-
- /* MSAA resolves sample from the source attachment. Transition the
- * depth attachment first to get rid of any HiZ that we may not be
- * able to handle.
- */
- transition_depth_buffer(cmd_buffer, src_iview->image,
- src_iview->planes[0].isl.base_array_layer,
- fb->layers,
- src_state->current_layout,
- VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
- false /* will_full_fast_clear */);
- src_state->aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, src_iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
- VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
- src_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
-
- /* MSAA resolves write to the resolve attachment as if it were any
- * other transfer op. Transition the resolve attachment accordingly.
- */
- VkImageLayout dst_initial_layout = dst_state->current_layout;
-
- /* If our render area is the entire size of the image, we're going to
- * blow it all away so we can claim the initial layout is UNDEFINED
- * and we'll get a HiZ ambiguate instead of a resolve.
- */
- if (dst_iview->image->vk.image_type != VK_IMAGE_TYPE_3D &&
- render_area.offset.x == 0 && render_area.offset.y == 0 &&
- render_area.extent.width == dst_iview->vk.extent.width &&
- render_area.extent.height == dst_iview->vk.extent.height)
- dst_initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
-
- transition_depth_buffer(cmd_buffer, dst_iview->image,
- dst_iview->planes[0].isl.base_array_layer,
- fb->layers,
- dst_initial_layout,
- VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
- false /* will_full_fast_clear */);
- dst_state->aux_usage =
- anv_layout_to_aux_usage(&cmd_buffer->device->info, dst_iview->image,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_USAGE_TRANSFER_DST_BIT,
- VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
- dst_state->current_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-
- enum blorp_filter filter =
- vk_to_blorp_resolve_mode(subpass->depth_resolve_mode);
-
- anv_image_msaa_resolve(cmd_buffer,
- src_iview->image, src_state->aux_usage,
- src_iview->planes[0].isl.base_level,
- src_iview->planes[0].isl.base_array_layer,
- dst_iview->image, dst_state->aux_usage,
- dst_iview->planes[0].isl.base_level,
- dst_iview->planes[0].isl.base_array_layer,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- render_area.offset.x, render_area.offset.y,
- render_area.offset.x, render_area.offset.y,
- render_area.extent.width,
- render_area.extent.height,
- fb->layers, filter);
- }
-
- if ((src_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- subpass->stencil_resolve_mode != VK_RESOLVE_MODE_NONE_KHR) {
-
- src_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
- dst_state->current_stencil_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
-
- enum isl_aux_usage src_aux_usage = ISL_AUX_USAGE_NONE;
- const uint32_t plane =
- anv_image_aspect_to_plane(dst_iview->image, VK_IMAGE_ASPECT_STENCIL_BIT);
- enum isl_aux_usage dst_aux_usage =
- dst_iview->image->planes[plane].aux_usage;
-
- enum blorp_filter filter =
- vk_to_blorp_resolve_mode(subpass->stencil_resolve_mode);
-
- anv_image_msaa_resolve(cmd_buffer,
- src_iview->image, src_aux_usage,
- src_iview->planes[0].isl.base_level,
- src_iview->planes[0].isl.base_array_layer,
- dst_iview->image, dst_aux_usage,
- dst_iview->planes[0].isl.base_level,
- dst_iview->planes[0].isl.base_array_layer,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- render_area.offset.x, render_area.offset.y,
- render_area.offset.x, render_area.offset.y,
- render_area.extent.width,
- render_area.extent.height,
- fb->layers, filter);
- }
- }
-
-#if GFX_VER == 7
- /* On gfx7, we have to store a texturable version of the stencil buffer in
- * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
- * forth at strategic points. Stencil writes are only allowed in following
- * layouts:
- *
- * - VK_IMAGE_LAYOUT_GENERAL
- * - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
- * - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
- * - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
- * - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL_KHR
- *
- * For general, we have no nice opportunity to transition so we do the copy
- * to the shadow unconditionally at the end of the subpass. For transfer
- * destinations, we can update it as part of the transfer op. For the other
- * layouts, we delay the copy until a transition into some other layout.
- */
- if (subpass->depth_stencil_attachment) {
- uint32_t a = subpass->depth_stencil_attachment->attachment;
- assert(a != VK_ATTACHMENT_UNUSED);
-
- struct anv_attachment_state *att_state = &cmd_state->attachments[a];
- struct anv_image_view *iview = cmd_state->attachments[a].image_view;;
- const struct anv_image *image = iview->image;
-
- if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- const uint32_t plane =
- anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
-
- if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
- att_state->current_stencil_layout == VK_IMAGE_LAYOUT_GENERAL) {
- assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
- anv_image_copy_to_shadow(cmd_buffer, image,
- VK_IMAGE_ASPECT_STENCIL_BIT,
- iview->planes[plane].isl.base_level, 1,
- iview->planes[plane].isl.base_array_layer,
- fb->layers);
- }
- }
}
-#endif /* GFX_VER == 7 */
- for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
- const uint32_t a = subpass->attachments[i].attachment;
- if (a == VK_ATTACHMENT_UNUSED)
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ const struct anv_attachment *att = &gfx->color_att[i];
+ if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
+ (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
continue;
- if (cmd_state->pass->attachments[a].last_subpass_idx != subpass_id)
- continue;
-
- assert(a < cmd_state->pass->attachment_count);
- struct anv_attachment_state *att_state = &cmd_state->attachments[a];
- struct anv_image_view *iview = cmd_state->attachments[a].image_view;
- const struct anv_image *image = iview->image;
-
- /* Transition the image into the final layout for this render pass */
- VkImageLayout target_layout =
- cmd_state->pass->attachments[a].final_layout;
- VkImageLayout target_stencil_layout =
- cmd_state->pass->attachments[a].stencil_final_layout;
-
- uint32_t base_layer, layer_count;
- if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
- base_layer = 0;
- layer_count = anv_minify(iview->image->vk.extent.depth,
- iview->planes[0].isl.base_level);
- } else {
- base_layer = iview->planes[0].isl.base_array_layer;
- layer_count = fb->layers;
- }
-
- if (image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
- assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
- transition_color_buffer(cmd_buffer, image, VK_IMAGE_ASPECT_COLOR_BIT,
- iview->planes[0].isl.base_level, 1,
- base_layer, layer_count,
- att_state->current_layout, target_layout,
- VK_QUEUE_FAMILY_IGNORED,
- VK_QUEUE_FAMILY_IGNORED,
- false /* will_full_fast_clear */);
- }
-
- if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
- transition_depth_buffer(cmd_buffer, image,
- base_layer, layer_count,
- att_state->current_layout, target_layout,
- false /* will_full_fast_clear */);
- }
-
- if (image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
- transition_stencil_buffer(cmd_buffer, image,
- iview->planes[0].isl.base_level, 1,
- base_layer, layer_count,
- att_state->current_stencil_layout,
- target_stencil_layout,
- false /* will_full_fast_clear */);
- }
+ anv_attachment_msaa_resolve(cmd_buffer, att, att->layout,
+ VK_IMAGE_ASPECT_COLOR_BIT);
}
- /* Accumulate any subpass flushes that need to happen after the subpass.
- * Yes, they do get accumulated twice in the NextSubpass case but since
- * genX_CmdNextSubpass just calls end/begin back-to-back, we just end up
- * ORing the bits in twice so it's harmless.
- */
- anv_add_pending_pipe_bits(cmd_buffer,
- cmd_buffer->state.pass->subpass_flushes[subpass_id + 1],
- "end subpass deps/attachments");
-}
-
-void genX(CmdBeginRenderPass2)(
- VkCommandBuffer commandBuffer,
- const VkRenderPassBeginInfo* pRenderPassBeginInfo,
- const VkSubpassBeginInfoKHR* pSubpassBeginInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_render_pass, pass, pRenderPassBeginInfo->renderPass);
- ANV_FROM_HANDLE(anv_framebuffer, framebuffer, pRenderPassBeginInfo->framebuffer);
- VkResult result;
-
- cmd_buffer->state.framebuffer = framebuffer;
- cmd_buffer->state.pass = pass;
- cmd_buffer->state.render_area = pRenderPassBeginInfo->renderArea;
+ if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
+ !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
+ const struct anv_image_view *src_iview = gfx->depth_att.iview;
- anv_measure_beginrenderpass(cmd_buffer);
-
- result = genX(cmd_buffer_setup_attachments)(cmd_buffer, pass,
- framebuffer,
- pRenderPassBeginInfo);
- if (result != VK_SUCCESS) {
- assert(anv_batch_has_error(&cmd_buffer->batch));
- return;
+ /* MSAA resolves sample from the source attachment. Transition the
+ * depth attachment first to get rid of any HiZ that we may not be
+ * able to handle.
+ */
+ transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
+ src_iview->planes[0].isl.base_array_layer,
+ layers,
+ gfx->depth_att.layout,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ false /* will_full_fast_clear */);
+
+ anv_attachment_msaa_resolve(cmd_buffer, &gfx->depth_att,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VK_IMAGE_ASPECT_DEPTH_BIT);
+
+ /* Transition the source back to the original layout. This seems a bit
+ * inefficient but, since HiZ resolves aren't destructive, going from
+ * less HiZ to more is generally a no-op.
+ */
+ transition_depth_buffer(cmd_buffer, src_iview->image, 0, 1,
+ src_iview->planes[0].isl.base_array_layer,
+ layers,
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ gfx->depth_att.layout,
+ false /* will_full_fast_clear */);
}
- genX(flush_pipeline_select_3d)(cmd_buffer);
-
- cmd_buffer_begin_subpass(cmd_buffer, 0);
-}
-
-void genX(CmdNextSubpass2)(
- VkCommandBuffer commandBuffer,
- const VkSubpassBeginInfoKHR* pSubpassBeginInfo,
- const VkSubpassEndInfoKHR* pSubpassEndInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
-
- assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-
- uint32_t prev_subpass = anv_get_subpass_id(&cmd_buffer->state);
- cmd_buffer_end_subpass(cmd_buffer);
- cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
-}
-
-void genX(CmdEndRenderPass2)(
- VkCommandBuffer commandBuffer,
- const VkSubpassEndInfoKHR* pSubpassEndInfo)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-
- if (anv_batch_has_error(&cmd_buffer->batch))
- return;
+ if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
+ !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
+ anv_attachment_msaa_resolve(cmd_buffer, &gfx->stencil_att,
+ gfx->stencil_att.layout,
+ VK_IMAGE_ASPECT_STENCIL_BIT);
+ }
- cmd_buffer_end_subpass(cmd_buffer);
- cmd_buffer->state.hiz_enabled = false;
+ trace_intel_end_render_pass(&cmd_buffer->trace,
+ gfx->render_area.extent.width,
+ gfx->render_area.extent.height,
+ gfx->color_att_count,
+ gfx->samples);
- /* Remove references to render pass specific state. This enables us to
- * detect whether or not we're in a renderpass.
- */
- cmd_buffer->state.framebuffer = NULL;
- cmd_buffer->state.pass = NULL;
- cmd_buffer->state.subpass = NULL;
+ anv_cmd_buffer_reset_rendering(cmd_buffer);
}
void
genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
{
-#if GFX_VERx10 >= 75
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
mi_reg32(ANV_PREDICATE_RESULT_REG));
@@ -6795,10 +5466,8 @@ genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
mip.CombineOperation = COMBINE_SET;
mip.CompareOperation = COMPARE_SRCS_EQUAL;
}
-#endif
}
-#if GFX_VERx10 >= 75
void genX(CmdBeginConditionalRenderingEXT)(
VkCommandBuffer commandBuffer,
const VkConditionalRenderingBeginInfoEXT* pConditionalRenderingBegin)
@@ -6817,7 +5486,9 @@ void genX(CmdBeginConditionalRenderingEXT)(
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &value_address);
+ mi_builder_set_mocs(&b, mocs);
/* Section 19.4 of the Vulkan 1.1.85 spec says:
*
@@ -6849,121 +5520,162 @@ void genX(CmdEndConditionalRenderingEXT)(
cmd_state->conditional_render_enabled = false;
}
-#endif
-/* Set of stage bits for which are pipelined, i.e. they get queued by the
- * command streamer for later execution.
+/* Set of stage bits that are pipelined, i.e. they get queued
+ * by the command streamer for later execution.
*/
#define ANV_PIPELINE_STAGE_PIPELINED_BITS \
- (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | \
- VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | \
- VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | \
- VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | \
- VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | \
- VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | \
- VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | \
- VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | \
- VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | \
- VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | \
- VK_PIPELINE_STAGE_TRANSFER_BIT | \
- VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT | \
- VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | \
- VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)
-
-void genX(CmdSetEvent)(
+ ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
+ VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
+ VK_PIPELINE_STAGE_2_HOST_BIT | \
+ VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
+
+void genX(CmdSetEvent2)(
VkCommandBuffer commandBuffer,
VkEvent _event,
- VkPipelineStageFlags stageMask)
+ const VkDependencyInfo* pDependencyInfo)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_event, event, _event);
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.PostSyncOperation = WriteImmediateData;
+ flush.Address = anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool,
+ event->state);
+ flush.ImmediateData = VK_EVENT_SET;
+ }
+ return;
+ }
+
+ VkPipelineStageFlags2 src_stages = 0;
+
+ for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
+ src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
+ for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
+ src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
+ for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
+ src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
+
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
- pc.StallAtPixelScoreboard = true;
- pc.CommandStreamerStallEnable = true;
- }
-
- pc.DestinationAddressType = DAT_PPGTT,
- pc.PostSyncOperation = WriteImmediateData,
- pc.Address = (struct anv_address) {
- cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- event->state.offset
- };
- pc.ImmediateData = VK_EVENT_SET;
- anv_debug_dump_pc(pc);
- }
+ enum anv_pipe_bits pc_bits = 0;
+ if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
+ pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+ pc_bits |= ANV_PIPE_CS_STALL_BIT;
+ }
+
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData,
+ anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
+ event->state),
+ VK_EVENT_SET, pc_bits);
}
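
The srcStageMask accumulation above mirrors what arrives from the API: the application sets the event with a full VkDependencyInfo, and only the source halves of its barriers determine whether the post-sync write needs CS/scoreboard stalls. A hedged usage sketch, not part of the patch; cmdbuf and event are placeholder handles.

#include <vulkan/vulkan.h>

static void
set_event_after_copy(VkCommandBuffer cmdbuf, VkEvent event /* placeholder */)
{
   const VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COPY_BIT, /* pipelined stage */
      .srcAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
      .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
   };
   const VkDependencyInfo dep = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   /* The driver ORs srcStageMask across every barrier array in dep; a
    * pipelined source stage (like COPY here) makes it add CS stall +
    * stall-at-scoreboard to the PIPE_CONTROL that writes VK_EVENT_SET.
    */
   vkCmdSetEvent2(cmdbuf, event, &dep);
}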
-void genX(CmdResetEvent)(
+void genX(CmdResetEvent2)(
VkCommandBuffer commandBuffer,
VkEvent _event,
- VkPipelineStageFlags stageMask)
+ VkPipelineStageFlags2 stageMask)
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_event, event, _event);
+ if (anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.PostSyncOperation = WriteImmediateData;
+ flush.Address = anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool,
+ event->state);
+ flush.ImmediateData = VK_EVENT_RESET;
+ }
+ return;
+ }
+
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
- pc.StallAtPixelScoreboard = true;
- pc.CommandStreamerStallEnable = true;
- }
-
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = (struct anv_address) {
- cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- event->state.offset
- };
- pc.ImmediateData = VK_EVENT_RESET;
- anv_debug_dump_pc(pc);
- }
+ enum anv_pipe_bits pc_bits = 0;
+ if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
+ pc_bits |= ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
+ pc_bits |= ANV_PIPE_CS_STALL_BIT;
+ }
+
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData,
+ anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
+ event->state),
+ VK_EVENT_RESET,
+ pc_bits);
}
-void genX(CmdWaitEvents)(
+void genX(CmdWaitEvents2)(
VkCommandBuffer commandBuffer,
uint32_t eventCount,
const VkEvent* pEvents,
- VkPipelineStageFlags srcStageMask,
- VkPipelineStageFlags destStageMask,
- uint32_t memoryBarrierCount,
- const VkMemoryBarrier* pMemoryBarriers,
- uint32_t bufferMemoryBarrierCount,
- const VkBufferMemoryBarrier* pBufferMemoryBarriers,
- uint32_t imageMemoryBarrierCount,
- const VkImageMemoryBarrier* pImageMemoryBarriers)
+ const VkDependencyInfo* pDependencyInfos)
{
-#if GFX_VER >= 8
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
for (uint32_t i = 0; i < eventCount; i++) {
ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
- sem.WaitMode = PollingMode,
- sem.CompareOperation = COMPARE_SAD_EQUAL_SDD,
- sem.SemaphoreDataDword = VK_EVENT_SET,
- sem.SemaphoreAddress = (struct anv_address) {
- cmd_buffer->device->dynamic_state_pool.block_pool.bo,
- event->state.offset
- };
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = VK_EVENT_SET;
+ sem.SemaphoreAddress = anv_state_pool_state_address(
+ &cmd_buffer->device->dynamic_state_pool,
+ event->state);
}
}
-#else
- anv_finishme("Implement events on gfx7");
-#endif
- genX(CmdPipelineBarrier)(commandBuffer, srcStageMask, destStageMask,
- false, /* byRegion */
- memoryBarrierCount, pMemoryBarriers,
- bufferMemoryBarrierCount, pBufferMemoryBarriers,
- imageMemoryBarrierCount, pImageMemoryBarriers);
+ cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
+}
+
+static uint32_t vk_to_intel_index_type(VkIndexType type)
+{
+ switch (type) {
+ case VK_INDEX_TYPE_UINT8_KHR:
+ return INDEX_BYTE;
+ case VK_INDEX_TYPE_UINT16:
+ return INDEX_WORD;
+ case VK_INDEX_TYPE_UINT32:
+ return INDEX_DWORD;
+ default:
+ unreachable("invalid index type");
+ }
+}
+
+void genX(CmdBindIndexBuffer2KHR)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ VkDeviceSize size,
+ VkIndexType indexType)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+ uint32_t restart_index = vk_index_to_restart(indexType);
+ if (cmd_buffer->state.gfx.restart_index != restart_index) {
+ cmd_buffer->state.gfx.restart_index = restart_index;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RESTART_INDEX;
+ }
+
+ uint32_t index_type = vk_to_intel_index_type(indexType);
+ if (cmd_buffer->state.gfx.index_buffer != buffer ||
+ cmd_buffer->state.gfx.index_type != index_type ||
+ cmd_buffer->state.gfx.index_offset != offset) {
+ cmd_buffer->state.gfx.index_buffer = buffer;
+ cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
+ cmd_buffer->state.gfx.index_offset = offset;
+ cmd_buffer->state.gfx.index_size = buffer ? vk_buffer_range(&buffer->vk, offset, size) : 0;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
+ }
}
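
For context, the size parameter comes from the VK_KHR_maintenance5 bind entry point; vk_buffer_range() clamps it against the buffer, so VK_WHOLE_SIZE works as well. A small illustrative sketch, not part of the patch; the handles are placeholders.

#include <vulkan/vulkan.h>

static void
bind_index_subrange(VkCommandBuffer cmdbuf, VkBuffer index_buffer /* placeholder */)
{
   /* Bind only the first 6 KiB of the buffer as 16-bit indices. The driver
    * records the clamped range as gfx.index_size and only flags
    * ANV_CMD_DIRTY_INDEX_BUFFER when something actually changed.
    */
   vkCmdBindIndexBuffer2KHR(cmdbuf, index_buffer,
                            0,        /* offset */
                            6 * 1024, /* size, or VK_WHOLE_SIZE */
                            VK_INDEX_TYPE_UINT16);
}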
VkResult genX(CmdSetPerformanceOverrideINTEL)(
@@ -6974,21 +5686,12 @@ VkResult genX(CmdSetPerformanceOverrideINTEL)(
switch (pOverrideInfo->type) {
case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
-#if GFX_VER >= 9
anv_batch_write_reg(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2), csdm2) {
csdm2._3DRenderingInstructionDisable = pOverrideInfo->enable;
csdm2.MediaInstructionDisable = pOverrideInfo->enable;
csdm2._3DRenderingInstructionDisableMask = true;
csdm2.MediaInstructionDisableMask = true;
}
-#else
- anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
- instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
- instpm.MediaInstructionDisable = pOverrideInfo->enable;
- instpm._3DRenderingInstructionDisableMask = true;
- instpm.MediaInstructionDisableMask = true;
- }
-#endif
break;
}
@@ -7019,13 +5722,495 @@ VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
return VK_SUCCESS;
}
+#define TIMESTAMP 0x2358
+
void genX(cmd_emit_timestamp)(struct anv_batch *batch,
- struct anv_bo *bo,
- uint32_t offset) {
- anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.PostSyncOperation = WriteTimestamp;
- pc.Address = (struct anv_address) {bo, offset};
- anv_debug_dump_pc(pc);
+ struct anv_device *device,
+ struct anv_address addr,
+ enum anv_timestamp_capture_type type,
+ void *data) {
+ /* Make sure the ANV_TIMESTAMP_CAPTURE_AT_CS_STALL and
+ * ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER capture types are not used on the
+ * transfer or video queues.
+ */
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
+ assert(type != ANV_TIMESTAMP_CAPTURE_AT_CS_STALL &&
+ type != ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER);
}
+
+ switch (type) {
+ case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
+ struct mi_builder b;
+ mi_builder_init(&b, device->info, batch);
+ mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
+ break;
+ }
+
+ case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE: {
+ if ((batch->engine_class == INTEL_ENGINE_CLASS_COPY) ||
+ (batch->engine_class == INTEL_ENGINE_CLASS_VIDEO)) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(device->info, 16018063123))
+ genX(batch_emit_fast_color_dummy_blit)(batch, device);
+ anv_batch_emit(batch, GENX(MI_FLUSH_DW), fd) {
+ fd.PostSyncOperation = WriteTimestamp;
+ fd.Address = addr;
+ }
+ } else {
+ genx_batch_emit_pipe_control_write(batch, device->info, 0,
+ WriteTimestamp, addr, 0, 0);
+ }
+ break;
+ }
+
+ case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
+ genx_batch_emit_pipe_control_write
+ (batch, device->info, 0, WriteTimestamp, addr, 0,
+ ANV_PIPE_CS_STALL_BIT);
+ break;
+
+#if GFX_VERx10 >= 125
+ case ANV_TIMESTAMP_REWRITE_COMPUTE_WALKER: {
+ uint32_t dwords[GENX(COMPUTE_WALKER_length)];
+
+ GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) {
+ .PostSync = (struct GENX(POSTSYNC_DATA)) {
+ .Operation = WriteTimestamp,
+ .DestinationAddress = addr,
+ .MOCS = anv_mocs(device, NULL, 0),
+ },
+ });
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
+ ((uint32_t *)data)[i] |= dwords[i];
+ break;
+ }
+
+ case ANV_TIMESTAMP_REWRITE_INDIRECT_DISPATCH: {
+ uint32_t dwords[GENX(EXECUTE_INDIRECT_DISPATCH_length)];
+
+ GENX(EXECUTE_INDIRECT_DISPATCH_pack)
+ (batch, dwords, &(struct GENX(EXECUTE_INDIRECT_DISPATCH)) {
+ .MOCS = anv_mocs(device, NULL, 0),
+ .COMPUTE_WALKER_BODY = {
+ .PostSync = (struct GENX(POSTSYNC_DATA)) {
+ .Operation = WriteTimestamp,
+ .DestinationAddress = addr,
+ .MOCS = anv_mocs(device, NULL, 0),
+ },
+ }
+ });
+
+ for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++)
+ ((uint32_t *)data)[i] |= dwords[i];
+ break;
+ }
+#endif
+
+ default:
+ unreachable("invalid");
+ }
+}
+
+void genX(batch_emit_secondary_call)(struct anv_batch *batch,
+ struct anv_address secondary_addr,
+ struct anv_address secondary_return_addr)
+{
+ /* Emit a write to change the return address of the secondary */
+ uint64_t *write_return_addr =
+ anv_batch_emitn(batch,
+ GENX(MI_STORE_DATA_IMM_length) + 1 /* QWord write */,
+ GENX(MI_STORE_DATA_IMM),
+#if GFX_VER >= 12
+ .ForceWriteCompletionCheck = true,
+#endif
+ .Address = secondary_return_addr) +
+ GENX(MI_STORE_DATA_IMM_ImmediateData_start) / 8;
+
+#if GFX_VER >= 12
+ /* Disable prefetcher before jumping into a secondary */
+ anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = true;
+ }
+#endif
+
+ /* Jump into the secondary */
+ anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.SecondLevelBatchBuffer = Firstlevelbatch;
+ bbs.BatchBufferStartAddress = secondary_addr;
+ }
+
+ /* Replace the return address written by the MI_STORE_DATA_IMM above with
+ * the primary's current batch address (immediately after the jump).
+ */
+ *write_return_addr =
+ anv_address_physical(anv_batch_current_address(batch));
+}
+
+void *
+genX(batch_emit_return)(struct anv_batch *batch)
+{
+ return anv_batch_emitn(batch,
+ GENX(MI_BATCH_BUFFER_START_length),
+ GENX(MI_BATCH_BUFFER_START),
+ .AddressSpaceIndicator = ASI_PPGTT,
+ .SecondLevelBatchBuffer = Firstlevelbatch);
+}
+
+void
+genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
+ const struct anv_device *device,
+ uint32_t primitive_topology,
+ uint32_t vertex_count)
+{
+#if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
+ if (intel_needs_workaround(device->info, 22014412737) &&
+ (primitive_topology == _3DPRIM_POINTLIST ||
+ primitive_topology == _3DPRIM_LINELIST ||
+ primitive_topology == _3DPRIM_LINESTRIP ||
+ primitive_topology == _3DPRIM_LINELIST_ADJ ||
+ primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
+ primitive_topology == _3DPRIM_LINELOOP ||
+ primitive_topology == _3DPRIM_POINTLIST_BF ||
+ primitive_topology == _3DPRIM_LINESTRIP_CONT ||
+ primitive_topology == _3DPRIM_LINESTRIP_BF ||
+ primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
+ (vertex_count == 1 || vertex_count == 2)) {
+ genx_batch_emit_pipe_control_write
+ (batch, device->info, 0, WriteImmediateData,
+ device->workaround_address, 0, 0);
+
+ /* Reset counter because we just emitted a PC */
+ batch->num_3d_primitives_emitted = 0;
+ } else if (intel_needs_workaround(device->info, 16014538804)) {
+ batch->num_3d_primitives_emitted++;
+ /* Wa_16014538804:
+ * After every 3 3D_PRIMITIVE commands,
+ * at least 1 PIPE_CONTROL must be inserted.
+ */
+ if (batch->num_3d_primitives_emitted == 3) {
+ anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
+ batch->num_3d_primitives_emitted = 0;
+ }
+ }
+#endif
+}
+
+/* Wa_16018063123 */
+ALWAYS_INLINE void
+genX(batch_emit_fast_color_dummy_blit)(struct anv_batch *batch,
+ struct anv_device *device)
+{
+#if GFX_VERx10 >= 125
+ anv_batch_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
+ blt.DestinationBaseAddress = device->workaround_address;
+ blt.DestinationMOCS = device->isl_dev.mocs.blitter_dst;
+ blt.DestinationPitch = 63;
+ blt.DestinationX2 = 1;
+ blt.DestinationY2 = 4;
+ blt.DestinationSurfaceWidth = 1;
+ blt.DestinationSurfaceHeight = 4;
+ blt.DestinationSurfaceType = XY_SURFTYPE_2D;
+ blt.DestinationSurfaceQPitch = 4;
+ blt.DestinationTiling = XY_TILE_LINEAR;
+ }
+#endif
+}
+
+void
+genX(urb_workaround)(struct anv_cmd_buffer *cmd_buffer,
+ const struct intel_urb_config *urb_cfg)
+{
+#if INTEL_NEEDS_WA_16014912113
+ const struct intel_urb_config *current =
+ &cmd_buffer->state.gfx.urb_cfg;
+ if (intel_urb_setup_changed(urb_cfg, current, MESA_SHADER_TESS_EVAL) &&
+ current->size[0] != 0) {
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ urb.VSURBStartingAddress = current->start[i];
+ urb.VSURBEntryAllocationSize = current->size[i] - 1;
+ urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
+ }
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.HDCPipelineFlushEnable = true;
+ }
+ }
+#endif
+}
+
+struct anv_state
+genX(cmd_buffer_begin_companion_rcs_syncpoint)(
+ struct anv_cmd_buffer *cmd_buffer)
+{
+#if GFX_VERx10 >= 125
+ const struct intel_device_info *info = cmd_buffer->device->info;
+ struct anv_state syncpoint =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 2 * sizeof(uint32_t), 4);
+ struct anv_address xcs_wait_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
+ struct anv_address rcs_wait_addr = anv_address_add(xcs_wait_addr, 4);
+
+ /* Reset the sync point */
+ memset(syncpoint.map, 0, 2 * sizeof(uint32_t));
+
+ struct mi_builder b;
+
+ /* On CCS:
+ * - flush all caches & invalidate
+ * - unblock RCS
+ * - wait on RCS to complete
+ * - clear the value we waited on
+ */
+
+ if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
+ anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_FLUSH_BITS |
+ ANV_PIPE_INVALIDATE_BITS |
+ ANV_PIPE_STALL_BITS,
+ "post main cmd buffer invalidate");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ } else if (anv_cmd_buffer_is_blitter_queue(cmd_buffer)) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
+ genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
+ cmd_buffer->device);
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), fd) {
+ fd.FlushCCS = true; /* Maybe handle Flush LLC */
+ }
+ }
+
+ {
+ mi_builder_init(&b, info, &cmd_buffer->batch);
+ mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x1));
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = 0x1;
+ sem.SemaphoreAddress = xcs_wait_addr;
+ }
+ /* Make sure to reset the semaphore in case the command buffer is run
+ * multiple times.
+ */
+ mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x0));
+ }
+
+ /* On RCS:
+ * - wait on CCS signal
+ * - clear the value we waited on
+ */
+ {
+ mi_builder_init(&b, info, &cmd_buffer->companion_rcs_cmd_buffer->batch);
+ anv_batch_emit(&cmd_buffer->companion_rcs_cmd_buffer->batch,
+ GENX(MI_SEMAPHORE_WAIT),
+ sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = 0x1;
+ sem.SemaphoreAddress = rcs_wait_addr;
+ }
+ /* Make sure to reset the semaphore in case the command buffer is run
+ * multiple times.
+ */
+ mi_store(&b, mi_mem32(rcs_wait_addr), mi_imm(0x0));
+ }
+
+ return syncpoint;
+#else
+ unreachable("Not implemented");
+#endif
+}
+
+void
+genX(cmd_buffer_end_companion_rcs_syncpoint)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_state syncpoint)
+{
+#if GFX_VERx10 >= 125
+ struct anv_address xcs_wait_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, syncpoint);
+
+ struct mi_builder b;
+
+ /* On RCS:
+ * - flush all caches & invalidate
+ * - unblock the CCS
+ */
+ anv_add_pending_pipe_bits(cmd_buffer->companion_rcs_cmd_buffer,
+ ANV_PIPE_FLUSH_BITS |
+ ANV_PIPE_INVALIDATE_BITS |
+ ANV_PIPE_STALL_BITS,
+ "post rcs flush");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer->companion_rcs_cmd_buffer);
+
+ mi_builder_init(&b, cmd_buffer->device->info,
+ &cmd_buffer->companion_rcs_cmd_buffer->batch);
+ mi_store(&b, mi_mem32(xcs_wait_addr), mi_imm(0x1));
+#else
+ unreachable("Not implemented");
+#endif
+}
+
+VkResult
+genX(write_trtt_entries)(struct anv_trtt_submission *submit)
+{
+#if GFX_VER >= 12
+ const struct intel_device_info *devinfo =
+ submit->sparse->queue->device->info;
+
+ size_t batch_size = submit->l3l2_binds_len * 20 +
+ submit->l1_binds_len * 16 +
+ GENX(PIPE_CONTROL_length) * sizeof(uint32_t) + 8;
+ STACK_ARRAY(uint32_t, cmds, batch_size);
+ struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *)cmds + batch_size,
+ };
+
+ /* BSpec says:
+ * "DWord Length programmed must not exceed 0x3FE."
+ * For a single dword write the programmed length is 2, and for a single
+ * qword it's 3. This is the value actually written to the register field,
+ * so it does not include the length bias.
+ */
+ uint32_t dword_write_len = 2;
+ uint32_t qword_write_len = 3;
+ uint32_t max_dword_extra_writes = 0x3FE - dword_write_len;
+ uint32_t max_qword_extra_writes = (0x3FE - qword_write_len) / 2;
+
+ /* What makes the code below quite complicated is the fact that we can
+ * write multiple values with MI_STORE_DATA_IMM as long as the writes go to
+ * contiguous addresses.
+ */
+
+ for (int i = 0; i < submit->l3l2_binds_len; i++) {
+ int extra_writes = 0;
+ for (int j = i + 1;
+ j < submit->l3l2_binds_len &&
+ extra_writes <= max_qword_extra_writes;
+ j++) {
+ if (submit->l3l2_binds[i].pte_addr + (j - i) * 8 ==
+ submit->l3l2_binds[j].pte_addr) {
+ extra_writes++;
+ } else {
+ break;
+ }
+ }
+ bool is_last_write = submit->l1_binds_len == 0 &&
+ i + extra_writes + 1 == submit->l3l2_binds_len;
+
+ uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
+ qword_write_len + (extra_writes * 2);
+ uint32_t *dw;
+ dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
+ .ForceWriteCompletionCheck = is_last_write,
+ .StoreQword = true,
+ .Address = anv_address_from_u64(submit->l3l2_binds[i].pte_addr),
+ );
+ dw += 3;
+ for (int j = 0; j < extra_writes + 1; j++) {
+ uint64_t entry_addr_64b = submit->l3l2_binds[i + j].entry_addr;
+ *dw = entry_addr_64b & 0xFFFFFFFF;
+ dw++;
+ *dw = (entry_addr_64b >> 32) & 0xFFFFFFFF;
+ dw++;
+ }
+ assert(dw == batch.next);
+
+ i += extra_writes;
+ }
+
+ for (int i = 0; i < submit->l1_binds_len; i++) {
+ int extra_writes = 0;
+ for (int j = i + 1;
+ j < submit->l1_binds_len && extra_writes <= max_dword_extra_writes;
+ j++) {
+ if (submit->l1_binds[i].pte_addr + (j - i) * 4 ==
+ submit->l1_binds[j].pte_addr) {
+ extra_writes++;
+ } else {
+ break;
+ }
+ }
+
+ bool is_last_write = i + extra_writes + 1 == submit->l1_binds_len;
+
+ uint32_t total_len = GENX(MI_STORE_DATA_IMM_length_bias) +
+ dword_write_len + extra_writes;
+ uint32_t *dw;
+ dw = anv_batch_emitn(&batch, total_len, GENX(MI_STORE_DATA_IMM),
+ .ForceWriteCompletionCheck = is_last_write,
+ .Address = anv_address_from_u64(submit->l1_binds[i].pte_addr),
+ );
+ dw += 3;
+ for (int j = 0; j < extra_writes + 1; j++) {
+ *dw = (submit->l1_binds[i + j].entry_addr >> 16) & 0xFFFFFFFF;
+ dw++;
+ }
+ assert(dw == batch.next);
+
+ i += extra_writes;
+ }
+
+ genx_batch_emit_pipe_control(&batch, devinfo, _3D,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_TLB_INVALIDATE_BIT);
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+
+ assert(batch.next <= batch.end);
+
+ VkResult result = anv_queue_submit_trtt_batch(submit->sparse, &batch);
+ STACK_ARRAY_FINISH(cmds);
+
+ return result;
+
+#endif
+ return VK_SUCCESS;
+}
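
A standalone sketch (illustrative types and names, not driver API) of the coalescing rule the two loops above apply: consecutive PTEs that are contiguous in memory ride in one MI_STORE_DATA_IMM, bounded by the 0x3FE DWord Length limit, i.e. at most 0x3FE - 2 = 1020 extra dword writes or (0x3FE - 3) / 2 = 509 extra qword writes.

#include <stdint.h>

struct pte_bind { uint64_t pte_addr; uint64_t entry_addr; };

/* Count how many binds following binds[i] can share its MI_STORE_DATA_IMM:
 * their PTE addresses must be contiguous (stride = entry size in bytes) and
 * the total must stay under the max_extra cap derived from the 0x3FE limit.
 */
static int
count_extra_writes(const struct pte_bind *binds, int len, int i,
                   uint64_t stride, int max_extra)
{
   int extra = 0;
   for (int j = i + 1; j < len && extra < max_extra; j++) {
      if (binds[i].pte_addr + (uint64_t)(j - i) * stride != binds[j].pte_addr)
         break;
      extra++;
   }
   return extra;
}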
+
+void
+genX(CmdWriteBufferMarker2AMD)(VkCommandBuffer commandBuffer,
+ VkPipelineStageFlags2 stage,
+ VkBuffer dstBuffer,
+ VkDeviceSize dstOffset,
+ uint32_t marker)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, dstBuffer);
+
+ /* The barriers inserted by the application to make dstBuffer writable
+ * should already have the L1/L2 cache flushes. On platforms where the
+ * command streamer is not coherent with L3, we need an additional set of
+ * cache flushes.
+ */
+ enum anv_pipe_bits bits =
+ (ANV_DEVINFO_HAS_COHERENT_L3_CS(cmd_buffer->device->info) ? 0 :
+ (ANV_PIPE_DATA_CACHE_FLUSH_BIT | ANV_PIPE_TILE_CACHE_FLUSH_BIT)) |
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT;
+
+ trace_intel_begin_write_buffer_marker(&cmd_buffer->trace);
+
+ anv_add_pending_pipe_bits(cmd_buffer, bits, "write buffer marker");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ /* Emitting a PIPE_CONTROL with Post-Sync Op = Write Immediate Data
+ * would be the logical way to implement this extension, as it could
+ * do a pipelined marker write. Unfortunately, it requires writing
+ * whole 64-bit QWords, and VK_AMD_buffer_marker requires writing a
+ * 32-bit value. MI_STORE_DATA_IMM is the only good way to do that,
+ * and unfortunately it requires stalling.
+ */
+ mi_store(&b, mi_mem32(anv_address_add(buffer->address, dstOffset)),
+ mi_imm(marker));
+
+ trace_intel_end_write_buffer_marker(&cmd_buffer->trace);
}
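
A hedged sketch of the application-side call implemented above, e.g. dropping breadcrumb markers around suspect work; illustrative only, with placeholder command-buffer and buffer handles.

#include <vulkan/vulkan.h>

static void
write_breadcrumb(VkCommandBuffer cmdbuf, VkBuffer marker_buf, /* placeholders */
                 uint32_t pass_index)
{
   /* Needs VK_AMD_buffer_marker (the *2AMD entry point comes with
    * synchronization2). On anv this lands as an MI_STORE_DATA_IMM after a
    * pipe flush, so the 32-bit write is not pipelined.
    */
   vkCmdWriteBufferMarker2AMD(cmdbuf,
                              VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                              marker_buf,
                              pass_index * sizeof(uint32_t), /* dstOffset */
                              0xDEAD0000u | pass_index);     /* marker */
}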
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
new file mode 100644
index 00000000000..7f05139e43f
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -0,0 +1,1168 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "anv_private.h"
+#include "anv_measure.h"
+#include "vk_render_pass.h"
+#include "vk_util.h"
+
+#include "common/intel_aux_map.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "genxml/genX_rt_pack.h"
+#include "common/intel_genX_state_brw.h"
+
+#include "ds/intel_tracepoints.h"
+
+/* We reserve :
+ * - GPR 14 for secondary command buffer returns
+ * - GPR 15 for conditional rendering
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 14
+#define __gen_get_batch_dwords anv_batch_emit_dwords
+#define __gen_address_offset anv_address_add
+#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
+#include "common/mi_builder.h"
+
+void
+genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t total_scratch)
+{
+#if GFX_VERx10 >= 125
+ assert(cmd_buffer->state.current_pipeline == GPGPU);
+
+ struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+
+ if (total_scratch <= comp_state->scratch_size)
+ return;
+
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
+ cfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * devinfo->subslice_total;
+
+ uint32_t scratch_surf = 0xffffffff;
+ if (total_scratch > 0) {
+ struct anv_bo *scratch_bo =
+ anv_scratch_pool_alloc(cmd_buffer->device,
+ &cmd_buffer->device->scratch_pool,
+ MESA_SHADER_COMPUTE,
+ total_scratch);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ scratch_bo);
+ scratch_surf =
+ anv_scratch_pool_get_surf(cmd_buffer->device,
+ &cmd_buffer->device->scratch_pool,
+ total_scratch);
+ cfe.ScratchSpaceBuffer = scratch_surf >> 4;
+ }
+
+ cfe.OverDispatchControl = 2; /* 50% overdispatch */
+ }
+
+ comp_state->scratch_size = total_scratch;
+#else
+ unreachable("Invalid call");
+#endif
+}
+
+static void
+genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+ struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(comp_state->base.pipeline);
+ const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
+
+ assert(pipeline->cs);
+
+ genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
+
+ genX(flush_descriptor_buffers)(cmd_buffer, &comp_state->base);
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+ /* Apply any pending pipeline flushes we may have. We want to apply them
+ * now because, if any of those flushes are for things like push constants,
+ * the GPU will read the state at weird times.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ if (cmd_buffer->state.compute.pipeline_dirty) {
+#if GFX_VERx10 < 125
+ /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
+ *
+ * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+ * the only bits that are changed are scoreboard related: Scoreboard
+ * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
+ * these scoreboard related states, a MEDIA_STATE_FLUSH is
+ * sufficient."
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "flush compute state");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+#endif
+
+ anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
+
+#if GFX_VERx10 >= 125
+ const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+ genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
+#endif
+
+ /* The workgroup size of the pipeline affects our push constant layout
+ * so flag push constants as dirty if we change the pipeline.
+ */
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ comp_state->base.push_constants_data_dirty = true;
+ }
+
+ cmd_buffer->state.descriptors_dirty |=
+ genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
+ &cmd_buffer->state.compute.base,
+ &pipeline->base);
+
+ if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
+ cmd_buffer->state.compute.pipeline_dirty) {
+ genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
+ &cmd_buffer->state.compute.base,
+ VK_SHADER_STAGE_COMPUTE_BIT,
+ &pipeline->cs, 1);
+ cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+
+#if GFX_VERx10 < 125
+ uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
+ struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
+ .BindingTablePointer =
+ cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
+ .SamplerStatePointer =
+ cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
+ };
+ GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
+
+ struct anv_state state =
+ anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
+ pipeline->interface_descriptor_data,
+ GENX(INTERFACE_DESCRIPTOR_DATA_length),
+ 64);
+
+ uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
+ mid.InterfaceDescriptorTotalLength = size;
+ mid.InterfaceDescriptorDataStartAddress = state.offset;
+ }
+#endif
+ }
+
+ if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
+
+ if (comp_state->push_data.alloc_size == 0 ||
+ comp_state->base.push_constants_data_dirty) {
+ comp_state->push_data =
+ anv_cmd_buffer_cs_push_constants(cmd_buffer);
+ comp_state->base.push_constants_data_dirty = false;
+ }
+
+#if GFX_VERx10 < 125
+ if (comp_state->push_data.alloc_size) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+ curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
+ curbe.CURBEDataStartAddress = comp_state->push_data.offset;
+ }
+ }
+#endif
+
+ cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
+ cmd_buffer->state.compute.pipeline_dirty = false;
+
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+}
+
+static void
+anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t baseGroupX,
+ uint32_t baseGroupY,
+ uint32_t baseGroupZ)
+{
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ struct anv_push_constants *push =
+ &cmd_buffer->state.compute.base.push_constants;
+ if (push->cs.base_work_group_id[0] != baseGroupX ||
+ push->cs.base_work_group_id[1] != baseGroupY ||
+ push->cs.base_work_group_id[2] != baseGroupZ) {
+ push->cs.base_work_group_id[0] = baseGroupX;
+ push->cs.base_work_group_id[1] = baseGroupY;
+ push->cs.base_work_group_id[2] = baseGroupZ;
+
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ cmd_buffer->state.compute.base.push_constants_data_dirty = true;
+ }
+}
+
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+
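+/* compute_load_indirect_params() below copies an application-supplied
+ * dispatch size into the GPGPU_DISPATCHDIM* registers that the walker
+ * reads when IndirectParameterEnable is set. The three dwords read from
+ * indirect_addr follow the VkDispatchIndirectCommand layout:
+ *
+ *    typedef struct VkDispatchIndirectCommand {
+ *       uint32_t x;   // byte offset 0
+ *       uint32_t y;   // byte offset 4
+ *       uint32_t z;   // byte offset 8
+ *    } VkDispatchIndirectCommand;
+ *
+ * compute_store_indirect_params() does the reverse, writing the registers
+ * back out to memory (used by cmd_buffer_dispatch_kernel() to fill the
+ * num_work_groups sysvals when no direct size is provided).
+ */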
+static void
+compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_address indirect_addr)
+{
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
+ struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
+ struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
+
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
+}
+
+static void
+compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_address indirect_addr)
+{
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
+ struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
+ struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
+
+ mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
+ mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
+ mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
+}
+
+
+#if GFX_VERx10 >= 125
+
+static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
+get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_shader_bin *shader,
+ const struct brw_cs_prog_data *prog_data,
+ const struct intel_cs_dispatch_info *dispatch)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+
+ return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+ .KernelStartPointer = shader->kernel.offset,
+ .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
+ .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
+ /* Typically set to 0 to avoid prefetching on every thread dispatch. */
+ .BindingTableEntryCount = devinfo->verx10 == 125 ?
+ 0 : 1 + MIN2(shader->bind_map.surface_count, 30),
+ .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
+ .SharedLocalMemorySize = encode_slm_size(GFX_VER, prog_data->base.total_shared),
+ .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
+ .NumberOfBarriers = prog_data->uses_barrier,
+ };
+}
+
+static inline void
+emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_shader_bin *shader,
+ const struct brw_cs_prog_data *prog_data,
+ struct anv_address indirect_addr)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ assert(devinfo->has_indirect_unroll);
+
+ struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+ bool predicate = cmd_buffer->state.conditional_render_enabled;
+
+ const struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+ const int dispatch_size = dispatch.simd_size / 16;
+
+ struct GENX(COMPUTE_WALKER_BODY) body = {
+ .SIMDSize = dispatch_size,
+ .MessageSIMD = dispatch_size,
+ .IndirectDataStartAddress = comp_state->push_data.offset,
+ .IndirectDataLength = comp_state->push_data.alloc_size,
+ .LocalXMaximum = prog_data->local_size[0] - 1,
+ .LocalYMaximum = prog_data->local_size[1] - 1,
+ .LocalZMaximum = prog_data->local_size[2] - 1,
+ .ExecutionMask = dispatch.right_mask,
+ .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ .InterfaceDescriptor =
+ get_interface_descriptor_data(cmd_buffer, shader, prog_data,
+ &dispatch),
+ };
+
+ cmd_buffer->last_indirect_dispatch =
+ anv_batch_emitn(
+ &cmd_buffer->batch,
+ GENX(EXECUTE_INDIRECT_DISPATCH_length),
+ GENX(EXECUTE_INDIRECT_DISPATCH),
+ .PredicateEnable = predicate,
+ .MaxCount = 1,
+ .COMPUTE_WALKER_BODY = body,
+ .ArgumentBufferStartAddress = indirect_addr,
+ .MOCS = anv_mocs(cmd_buffer->device,
+ indirect_addr.bo, 0),
+ );
+}
+
+static inline void
+emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_compute_pipeline *pipeline, bool indirect,
+ const struct brw_cs_prog_data *prog_data,
+ uint32_t groupCountX, uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
+ const bool predicate = cmd_buffer->state.conditional_render_enabled;
+
+ const struct intel_device_info *devinfo = pipeline->base.device->info;
+ const struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+
+ cmd_buffer->last_compute_walker =
+ anv_batch_emitn(
+ &cmd_buffer->batch,
+ GENX(COMPUTE_WALKER_length),
+ GENX(COMPUTE_WALKER),
+ .IndirectParameterEnable = indirect,
+ .PredicateEnable = predicate,
+ .SIMDSize = dispatch.simd_size / 16,
+ .MessageSIMD = dispatch.simd_size / 16,
+ .IndirectDataStartAddress = comp_state->push_data.offset,
+ .IndirectDataLength = comp_state->push_data.alloc_size,
+#if GFX_VERx10 == 125
+ .SystolicModeEnable = prog_data->uses_systolic,
+#endif
+ .GenerateLocalID = prog_data->generate_local_id != 0,
+ .EmitLocal = prog_data->generate_local_id,
+ .WalkOrder = prog_data->walk_order,
+ .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
+ TileY32bpe : Linear,
+ .LocalXMaximum = prog_data->local_size[0] - 1,
+ .LocalYMaximum = prog_data->local_size[1] - 1,
+ .LocalZMaximum = prog_data->local_size[2] - 1,
+ .ThreadGroupIDXDimension = groupCountX,
+ .ThreadGroupIDYDimension = groupCountY,
+ .ThreadGroupIDZDimension = groupCountZ,
+ .ExecutionMask = dispatch.right_mask,
+ .PostSync = {
+ .MOCS = anv_mocs(pipeline->base.device, NULL, 0),
+ },
+ .InterfaceDescriptor =
+ get_interface_descriptor_data(cmd_buffer, pipeline->cs,
+ prog_data, &dispatch),
+ );
+}
+
+#else /* #if GFX_VERx10 >= 125 */
+
+static inline void
+emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_compute_pipeline *pipeline, bool indirect,
+ const struct brw_cs_prog_data *prog_data,
+ uint32_t groupCountX, uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ const bool predicate = cmd_buffer->state.conditional_render_enabled;
+
+ const struct intel_device_info *devinfo = pipeline->base.device->info;
+ const struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
+ ggw.IndirectParameterEnable = indirect;
+ ggw.PredicateEnable = predicate;
+ ggw.SIMDSize = dispatch.simd_size / 16;
+ ggw.ThreadDepthCounterMaximum = 0;
+ ggw.ThreadHeightCounterMaximum = 0;
+ ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
+ ggw.ThreadGroupIDXDimension = groupCountX;
+ ggw.ThreadGroupIDYDimension = groupCountY;
+ ggw.ThreadGroupIDZDimension = groupCountZ;
+ ggw.RightExecutionMask = dispatch.right_mask;
+ ggw.BottomExecutionMask = 0xffffffff;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
+}
+
+#endif /* #if GFX_VERx10 >= 125 */
+
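+/* Shared entry point for the direct and indirect dispatch paths below.
+ * Roughly:
+ *
+ *    if (indirect && has_indirect_unroll)   // Gfx12.5+ parts only
+ *       EXECUTE_INDIRECT_DISPATCH, the HW fetches the arguments itself
+ *    else if (indirect)
+ *       load GPGPU_DISPATCHDIM* from memory, then emit the walker with
+ *       IndirectParameterEnable = true
+ *    else
+ *       emit the walker with the explicit group counts
+ *
+ * where "the walker" is COMPUTE_WALKER on Gfx12.5+ and GPGPU_WALKER on
+ * earlier generations.
+ */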
+static inline void
+emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_compute_pipeline *pipeline,
+ const struct brw_cs_prog_data *prog_data,
+ struct anv_address indirect_addr,
+ uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
+{
+ bool is_indirect = !anv_address_is_null(indirect_addr);
+
+#if GFX_VERx10 >= 125
+ if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
+ emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
+ indirect_addr);
+ return;
+ }
+#endif
+
+ if (is_indirect)
+ compute_load_indirect_params(cmd_buffer, indirect_addr);
+
+#if GFX_VERx10 >= 125
+ emit_compute_walker(cmd_buffer, pipeline, is_indirect, prog_data,
+ groupCountX, groupCountY, groupCountZ);
+#else
+ emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
+ groupCountX, groupCountY, groupCountZ);
+#endif
+}
+
+void genX(CmdDispatchBase)(
+ VkCommandBuffer commandBuffer,
+ uint32_t baseGroupX,
+ uint32_t baseGroupY,
+ uint32_t baseGroupZ,
+ uint32_t groupCountX,
+ uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+ const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+ anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
+ baseGroupY, baseGroupZ);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_COMPUTE,
+ "compute",
+ groupCountX * groupCountY * groupCountZ *
+ prog_data->local_size[0] * prog_data->local_size[1] *
+ prog_data->local_size[2]);
+
+ trace_intel_begin_compute(&cmd_buffer->trace);
+
+ if (prog_data->uses_num_work_groups) {
+ struct anv_state state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 12, 4);
+ uint32_t *sizes = state.map;
+ sizes[0] = groupCountX;
+ sizes[1] = groupCountY;
+ sizes[2] = groupCountZ;
+ cmd_buffer->state.compute.num_workgroups =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+
+ /* The num_workgroups buffer goes in the binding table */
+ cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
+ genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ emit_cs_walker(cmd_buffer, pipeline, prog_data,
+ ANV_NULL_ADDRESS /* no indirect data */,
+ groupCountX, groupCountY, groupCountZ);
+
+ trace_intel_end_compute(&cmd_buffer->trace,
+ groupCountX, groupCountY, groupCountZ);
+}
+
+void genX(CmdDispatchIndirect)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ struct anv_compute_pipeline *pipeline =
+ anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+ const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+ struct anv_address addr = anv_address_add(buffer->address, offset);
+ UNUSED struct anv_batch *batch = &cmd_buffer->batch;
+
+ anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_COMPUTE,
+ "compute indirect",
+ 0);
+ trace_intel_begin_compute(&cmd_buffer->trace);
+
+ if (prog_data->uses_num_work_groups) {
+ cmd_buffer->state.compute.num_workgroups = addr;
+
+ /* The num_workgroups buffer goes in the binding table */
+ cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+ }
+
+ genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);
+
+ trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
+}
+
+struct anv_address
+genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
+{
+#if GFX_VERx10 >= 125
+ struct anv_device *device = cmd_buffer->device;
+
+ struct anv_state state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
+ struct brw_rt_scratch_layout layout;
+ uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
+ * some cases?
+ */
+ brw_rt_compute_scratch_layout(&layout, device->info,
+ stack_ids_per_dss, 1 << 10);
+
+ const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+ .MemBaseAddress = (struct anv_address) {
+ /* The ray query HW computes offsets from the top of the buffer, so
+ * point the address at the end of the buffer.
+ */
+ .bo = device->ray_query_bo,
+ .offset = device->ray_query_bo->size
+ },
+ .AsyncRTStackSize = layout.ray_stack_stride / 64,
+ .NumDSSRTStacks = layout.stack_ids_per_dss,
+ .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+ .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+ .ResumeShaderTable = (struct anv_address) {
+ .bo = cmd_buffer->state.ray_query_shadow_bo,
+ },
+ };
+ GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);
+
+ return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+#else
+ unreachable("Not supported");
+#endif
+}
+
+#if GFX_VERx10 >= 125
+void
+genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_kernel *kernel,
+ const uint32_t *global_size,
+ uint32_t arg_count,
+ const struct anv_kernel_arg *args)
+{
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ const struct brw_cs_prog_data *cs_prog_data =
+ brw_cs_prog_data_const(kernel->bin->prog_data);
+
+ genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+ /* Apply any pending pipeline flushes we may have. We want to apply them
+ * now because, if any of those flushes are for things like push constants,
+ * the GPU will read the state at weird times.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
+ indirect_data_size += kernel->bin->bind_map.kernel_args_size;
+ indirect_data_size = ALIGN(indirect_data_size, 64);
+ struct anv_state indirect_data =
+ anv_cmd_buffer_alloc_general_state(cmd_buffer,
+ indirect_data_size, 64);
+ memset(indirect_data.map, 0, indirect_data.alloc_size);
+
+ struct brw_kernel_sysvals sysvals = {};
+ if (global_size != NULL) {
+ for (unsigned i = 0; i < 3; i++)
+ sysvals.num_work_groups[i] = global_size[i];
+ memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
+ } else {
+ struct anv_address sysvals_addr = {
+ .bo = NULL, /* General state buffer is always 0. */
+ .offset = indirect_data.offset,
+ };
+
+ compute_store_indirect_params(cmd_buffer, sysvals_addr);
+ }
+
+ void *args_map = indirect_data.map + sizeof(sysvals);
+ for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
+ struct brw_kernel_arg_desc *arg_desc =
+ &kernel->bin->bind_map.kernel_args[i];
+ assert(i < arg_count);
+ const struct anv_kernel_arg *arg = &args[i];
+ if (arg->is_ptr) {
+ memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
+ } else {
+ assert(arg_desc->size <= sizeof(arg->u64));
+ memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
+ }
+ }
+
+ struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
+ cw.PredicateEnable = false;
+ cw.SIMDSize = dispatch.simd_size / 16;
+ cw.MessageSIMD = dispatch.simd_size / 16;
+ cw.IndirectDataStartAddress = indirect_data.offset;
+ cw.IndirectDataLength = indirect_data.alloc_size;
+ cw.LocalXMaximum = cs_prog_data->local_size[0] - 1;
+ cw.LocalYMaximum = cs_prog_data->local_size[1] - 1;
+ cw.LocalZMaximum = cs_prog_data->local_size[2] - 1;
+ cw.ExecutionMask = dispatch.right_mask;
+ cw.PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal;
+
+ if (global_size != NULL) {
+ cw.ThreadGroupIDXDimension = global_size[0];
+ cw.ThreadGroupIDYDimension = global_size[1];
+ cw.ThreadGroupIDZDimension = global_size[2];
+ } else {
+ cw.IndirectParameterEnable = true;
+ }
+
+ cw.InterfaceDescriptor =
+ get_interface_descriptor_data(cmd_buffer,
+ kernel->bin,
+ cs_prog_data,
+ &dispatch);
+ }
+
+ /* We just blew away the compute pipeline state */
+ cmd_buffer->state.compute.pipeline_dirty = true;
+}
+
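+/* Spread at most three doublings (a local group of at most 2^3 = 8
+ * invocations) round-robin across the dimensions that are still smaller
+ * than their global size; whatever is left over goes to X. A couple of
+ * worked examples:
+ *
+ *    global = { 1920, 1080, 1 }  ->  local_shift = { 2, 1, 0 }  (4x2x1)
+ *    global = { 1, 1, 1 }        ->  local_shift = { 3, 0, 0 }  (8x1x1)
+ *
+ * The caller then launches DIV_ROUND_UP(global[i], 1 << local_shift[i])
+ * thread groups per dimension, e.g. { 480, 540, 1 } in the first case.
+ */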
+static void
+calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
+{
+ unsigned total_shift = 0;
+ memset(local_shift, 0, 3);
+
+ bool progress;
+ do {
+ progress = false;
+ for (unsigned i = 0; i < 3; i++) {
+ assert(global[i] > 0);
+ if ((1 << local_shift[i]) < global[i]) {
+ progress = true;
+ local_shift[i]++;
+ total_shift++;
+ }
+
+ if (total_shift == 3)
+ return;
+ }
+ } while(progress);
+
+ /* Assign whatever's left to x */
+ local_shift[0] += 3 - total_shift;
+}
+
+static struct GENX(RT_SHADER_TABLE)
+vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
+{
+ return (struct GENX(RT_SHADER_TABLE)) {
+ .BaseAddress = anv_address_from_u64(region->deviceAddress),
+ .Stride = region->stride,
+ };
+}
+
+struct trace_params {
+ /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
+ * with mi_builder.
+ */
+ bool is_sbt_indirect;
+ const VkStridedDeviceAddressRegionKHR *raygen_sbt;
+ const VkStridedDeviceAddressRegionKHR *miss_sbt;
+ const VkStridedDeviceAddressRegionKHR *hit_sbt;
+ const VkStridedDeviceAddressRegionKHR *callable_sbt;
+
+ /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
+ uint64_t indirect_sbts_addr;
+
+ /* If is_launch_size_indirect, use launch_size_addr to program the dispatch size. */
+ bool is_launch_size_indirect;
+ uint32_t launch_size[3];
+
+ /* A pointer to a uint32_t[3] */
+ uint64_t launch_size_addr;
+};
+
+static struct anv_state
+cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
+ struct trace_params *params)
+{
+ assert(!params->is_sbt_indirect);
+ assert(params->miss_sbt != NULL);
+ assert(params->hit_sbt != NULL);
+ assert(params->callable_sbt != NULL);
+
+ struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+
+ struct anv_state rtdg_state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ BRW_RT_PUSH_CONST_OFFSET +
+ sizeof(struct anv_push_constants),
+ 64);
+
+ struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+ .MemBaseAddress = (struct anv_address) {
+ .bo = rt->scratch.bo,
+ .offset = rt->scratch.layout.ray_stack_start,
+ },
+ .CallStackHandler = anv_shader_bin_get_bsr(
+ cmd_buffer->device->rt_trivial_return, 0),
+ .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
+ .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
+ .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+ .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+ .HitGroupTable = vk_sdar_to_shader_table(params->hit_sbt),
+ .MissGroupTable = vk_sdar_to_shader_table(params->miss_sbt),
+ .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
+ .LaunchWidth = params->launch_size[0],
+ .LaunchHeight = params->launch_size[1],
+ .LaunchDepth = params->launch_size[2],
+ .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
+ };
+ GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);
+
+ return rtdg_state;
+}
+
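+/* Build a 64-bit shader table entry in place: the SBT base address masked
+ * to its low 49 bits, OR'ed with the stride (read as 32 bits) shifted into
+ * the top word. A worked example with illustrative values,
+ * addr = 0x123456700000 and stride = 0x40:
+ *
+ *    (0x123456700000 & (BITFIELD64_BIT(49) - 1)) | (0x40ull << 48)
+ *       = 0x0040123456700000
+ *
+ * which is presumably the same packing RT_DISPATCH_GLOBALS_pack() produces
+ * for an RT_SHADER_TABLE in the non-indirect path above.
+ */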
+static struct mi_value
+mi_build_sbt_entry(struct mi_builder *b,
+ uint64_t addr_field_addr,
+ uint64_t stride_field_addr)
+{
+ return mi_ior(b,
+ mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
+ mi_imm(BITFIELD64_BIT(49) - 1)),
+ mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
+ 48));
+}
+
+static struct anv_state
+cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
+ struct trace_params *params)
+{
+ struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+
+ struct anv_state rtdg_state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
+ BRW_RT_PUSH_CONST_OFFSET +
+ sizeof(struct anv_push_constants),
+ 64);
+
+ struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
+ .MemBaseAddress = (struct anv_address) {
+ .bo = rt->scratch.bo,
+ .offset = rt->scratch.layout.ray_stack_start,
+ },
+ .CallStackHandler = anv_shader_bin_get_bsr(
+ cmd_buffer->device->rt_trivial_return, 0),
+ .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
+ .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
+ .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
+ .Flags = RT_DEPTH_TEST_LESS_EQUAL,
+ .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
+ };
+ GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);
+
+ struct anv_address rtdg_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
+ mi_builder_set_mocs(&b, mocs);
+
+ /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
+ * RT_DISPATCH_GLOBALS using the mi_builder.
+ */
+ mi_store(&b,
+ mi_mem64(
+ anv_address_add(
+ rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
+ mi_build_sbt_entry(&b,
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ missShaderBindingTableAddress),
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ missShaderBindingTableStride)));
+ mi_store(&b,
+ mi_mem64(
+ anv_address_add(
+ rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
+ mi_build_sbt_entry(&b,
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ hitShaderBindingTableAddress),
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ hitShaderBindingTableStride)));
+ mi_store(&b,
+ mi_mem64(
+ anv_address_add(
+ rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
+ mi_build_sbt_entry(&b,
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ callableShaderBindingTableAddress),
+ params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ callableShaderBindingTableStride)));
+
+ return rtdg_state;
+}
+
+static void
+cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
+ struct trace_params *params)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
+ struct anv_ray_tracing_pipeline *pipeline =
+ anv_pipeline_to_ray_tracing(rt->base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ /* If we have a known degenerate launch size, just bail */
+ if (!params->is_launch_size_indirect &&
+ (params->launch_size[0] == 0 ||
+ params->launch_size[1] == 0 ||
+ params->launch_size[2] == 0))
+ return;
+
+ trace_intel_begin_rays(&cmd_buffer->trace);
+
+ genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
+
+ genX(flush_descriptor_buffers)(cmd_buffer, &rt->base);
+
+ genX(flush_pipeline_select_gpgpu)(cmd_buffer);
+
+ cmd_buffer->state.rt.pipeline_dirty = false;
+
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
+ &cmd_buffer->state.rt.base,
+ &pipeline->base);
+
+ /* Add these to the reloc list manually as they're internal buffers that
+ * don't have any relocs that would otherwise pick them up.
+ *
+ * TODO(RT): This is a bit of a hack
+ */
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ rt->scratch.bo);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ cmd_buffer->device->btd_fifo_bo);
+
+ /* Allocate and set up our RT_DISPATCH_GLOBALS */
+ struct anv_state rtdg_state =
+ params->is_sbt_indirect ?
+ cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
+ cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);
+
+ assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
+ sizeof(struct anv_push_constants)));
+ assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
+ /* Push constants go after the RT_DISPATCH_GLOBALS */
+ memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
+ &cmd_buffer->state.rt.base.push_constants,
+ sizeof(struct anv_push_constants));
+
+ struct anv_address rtdg_addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
+
+ uint8_t local_size_log2[3];
+ uint32_t global_size[3] = {};
+ if (params->is_launch_size_indirect) {
+ /* Pick a local size that's probably ok. We assume most TraceRays calls
+ * will use a two-dimensional dispatch size. Worst case, our initial
+ * dispatch will be a little slower than it has to be.
+ */
+ local_size_log2[0] = 2;
+ local_size_log2[1] = 1;
+ local_size_log2[2] = 0;
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
+ mi_builder_set_mocs(&b, mocs);
+
+ struct mi_value launch_size[3] = {
+ mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
+ mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
+ mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
+ };
+
+ /* Store the original launch size into RT_DISPATCH_GLOBALS */
+ mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
+ mi_value_ref(&b, launch_size[0]));
+ mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
+ mi_value_ref(&b, launch_size[1]));
+ mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
+ GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
+ mi_value_ref(&b, launch_size[2]));
+
+ /* Compute the global dispatch size */
+ for (unsigned i = 0; i < 3; i++) {
+ if (local_size_log2[i] == 0)
+ continue;
+
+ /* global_size = DIV_ROUND_UP(launch_size, local_size)
+ *
+ * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has the
+ * semantics of shifting the entire 64-bit value and taking the bottom
+ * 32 bits, so we don't have to worry about roll-over.
+ */
+ uint32_t local_size = 1 << local_size_log2[i];
+ launch_size[i] = mi_iadd(&b, launch_size[i],
+ mi_imm(local_size - 1));
+ launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
+ local_size_log2[i]);
+ }
+
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
+ mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
+
+ } else {
+ calc_local_trace_size(local_size_log2, params->launch_size);
+
+ for (unsigned i = 0; i < 3; i++) {
+ /* We have to be a bit careful here because DIV_ROUND_UP adds to the
+ * numerator, which may overflow. Cast to uint64_t to avoid this.
+ */
+ uint32_t local_size = 1 << local_size_log2[i];
+ global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
+ }
+ }
+
+#if GFX_VERx10 == 125
+ /* Wa_14014427904 - We need additional invalidate/flush when
+ * emitting NP state commands with ATS-M in compute mode.
+ */
+ if (intel_device_info_is_atsm(device->info) &&
+ cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
+ /* TODO: This is the timeout after which the bucketed thread dispatcher
+ * will kick off a wave of threads. We go with the lowest value
+ * for now. It could be tweaked on a per application basis
+ * (drirc).
+ */
+ btd.DispatchTimeoutCounter = _64clocks;
+ /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
+ * buffer must be 128KB."
+ */
+ btd.PerDSSMemoryBackedBufferSize = 6;
+ btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
+ if (pipeline->base.scratch_size > 0) {
+ struct anv_bo *scratch_bo =
+ anv_scratch_pool_alloc(device,
+ &device->scratch_pool,
+ MESA_SHADER_COMPUTE,
+ pipeline->base.scratch_size);
+ anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
+ scratch_bo);
+ uint32_t scratch_surf =
+ anv_scratch_pool_get_surf(cmd_buffer->device,
+ &device->scratch_pool,
+ pipeline->base.scratch_size);
+ btd.ScratchSpaceBuffer = scratch_surf >> 4;
+ }
+ }
+
+ genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);
+
+ const struct brw_cs_prog_data *cs_prog_data =
+ brw_cs_prog_data_const(device->rt_trampoline->prog_data);
+ struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
+ cw.IndirectParameterEnable = params->is_launch_size_indirect;
+ cw.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ cw.SIMDSize = dispatch.simd_size / 16;
+ cw.MessageSIMD = dispatch.simd_size / 16;
+ cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
+ cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
+ cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
+ cw.ThreadGroupIDXDimension = global_size[0];
+ cw.ThreadGroupIDYDimension = global_size[1];
+ cw.ThreadGroupIDZDimension = global_size[2];
+ cw.ExecutionMask = 0xff;
+ cw.EmitInlineParameter = true;
+ cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);
+
+ const gl_shader_stage s = MESA_SHADER_RAYGEN;
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
+ struct anv_state *samplers = &cmd_buffer->state.samplers[s];
+ cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+ .KernelStartPointer = device->rt_trampoline->kernel.offset,
+ .SamplerStatePointer = samplers->offset,
+ /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
+ .SamplerCount = 0,
+ .BindingTablePointer = surfaces->offset,
+ .NumberofThreadsinGPGPUThreadGroup = 1,
+ .BTDMode = true,
+ };
+
+ struct brw_rt_raygen_trampoline_params trampoline_params = {
+ .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
+ .raygen_bsr_addr =
+ params->is_sbt_indirect ?
+ (params->indirect_sbts_addr +
+ offsetof(VkTraceRaysIndirectCommand2KHR,
+ raygenShaderRecordAddress)) :
+ params->raygen_sbt->deviceAddress,
+ .is_indirect = params->is_sbt_indirect,
+ .local_group_size_log2 = {
+ local_size_log2[0],
+ local_size_log2[1],
+ local_size_log2[2],
+ },
+ };
+ STATIC_ASSERT(sizeof(trampoline_params) == 32);
+ memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
+ }
+
+ trace_intel_end_rays(&cmd_buffer->trace,
+ params->launch_size[0],
+ params->launch_size[1],
+ params->launch_size[2]);
+}
+
+void
+genX(CmdTraceRaysKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
+ uint32_t width,
+ uint32_t height,
+ uint32_t depth)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct trace_params params = {
+ .is_sbt_indirect = false,
+ .raygen_sbt = pRaygenShaderBindingTable,
+ .miss_sbt = pMissShaderBindingTable,
+ .hit_sbt = pHitShaderBindingTable,
+ .callable_sbt = pCallableShaderBindingTable,
+ .is_launch_size_indirect = false,
+ .launch_size = {
+ width,
+ height,
+ depth,
+ },
+ };
+
+ cmd_buffer_trace_rays(cmd_buffer, &params);
+}
+
+void
+genX(CmdTraceRaysIndirectKHR)(
+ VkCommandBuffer commandBuffer,
+ const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
+ const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
+ VkDeviceAddress indirectDeviceAddress)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct trace_params params = {
+ .is_sbt_indirect = false,
+ .raygen_sbt = pRaygenShaderBindingTable,
+ .miss_sbt = pMissShaderBindingTable,
+ .hit_sbt = pHitShaderBindingTable,
+ .callable_sbt = pCallableShaderBindingTable,
+ .is_launch_size_indirect = true,
+ .launch_size_addr = indirectDeviceAddress,
+ };
+
+ cmd_buffer_trace_rays(cmd_buffer, &params);
+}
+
+void
+genX(CmdTraceRaysIndirect2KHR)(
+ VkCommandBuffer commandBuffer,
+ VkDeviceAddress indirectDeviceAddress)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct trace_params params = {
+ .is_sbt_indirect = true,
+ .indirect_sbts_addr = indirectDeviceAddress,
+ .is_launch_size_indirect = true,
+ .launch_size_addr = indirectDeviceAddress +
+ offsetof(VkTraceRaysIndirectCommand2KHR, width),
+ };
+
+ cmd_buffer_trace_rays(cmd_buffer, &params);
+}
+
+#endif /* GFX_VERx10 >= 125 */
diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c
new file mode 100644
index 00000000000..64a806659b6
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_draw.c
@@ -0,0 +1,2330 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "anv_private.h"
+#include "anv_measure.h"
+#include "vk_render_pass.h"
+#include "vk_util.h"
+
+#include "common/intel_aux_map.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "genxml/genX_rt_pack.h"
+#include "common/intel_genX_state_brw.h"
+
+#include "ds/intel_tracepoints.h"
+
+/* We reserve:
+ * - GPR 14 for secondary command buffer returns
+ * - GPR 15 for conditional rendering
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 14
+#define __gen_get_batch_dwords anv_batch_emit_dwords
+#define __gen_address_offset anv_address_add
+#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
+#include "common/mi_builder.h"
+
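+/* Carve the push constant URB space into per-stage allocations. A worked
+ * example for a plain VS+FS pipeline on a part with 16KB of push constant
+ * space: size_per_stage = 16 / 2 = 8KB, so the loop below programs
+ *
+ *    3DSTATE_PUSH_CONSTANT_ALLOC_VS: offset 0KB, size 8KB
+ *    3DSTATE_PUSH_CONSTANT_ALLOC_{HS,DS,GS}: size 0
+ *    3DSTATE_PUSH_CONSTANT_ALLOC_PS: offset 8KB, size 8KB
+ *
+ * On 32KB parts, size_per_stage is additionally rounded down to a multiple
+ * of 2KB (the "&= ~1u" below).
+ */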
+static void
+cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ VkShaderStageFlags stages = pipeline->base.base.active_stages;
+
+ /* In order to avoid thrashing, we assume that vertex and fragment stages
+ * always exist. In the rare case where one is missing *and* the other
+ * uses push constants, this may be suboptimal. However, avoiding stalls
+ * seems more important.
+ */
+ stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
+ if (anv_pipeline_is_primitive(pipeline))
+ stages |= VK_SHADER_STAGE_VERTEX_BIT;
+
+ if (stages == cmd_buffer->state.gfx.push_constant_stages)
+ return;
+
+ unsigned push_constant_kb;
+
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ if (anv_pipeline_is_mesh(pipeline))
+ push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
+ else
+ push_constant_kb = devinfo->max_constant_urb_size_kb;
+
+ const unsigned num_stages =
+ util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
+ unsigned size_per_stage = push_constant_kb / num_stages;
+
+ /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
+ * units of 2KB. Incidentally, these are the same platforms that have
+ * 32KB worth of push constant space.
+ */
+ if (push_constant_kb == 32)
+ size_per_stage &= ~1u;
+
+ uint32_t kb_used = 0;
+ for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
+ const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
+ alloc._3DCommandSubOpcode = 18 + i;
+ alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
+ alloc.ConstantBufferSize = push_size;
+ }
+ kb_used += push_size;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
+ alloc.ConstantBufferOffset = kb_used;
+ alloc.ConstantBufferSize = push_constant_kb - kb_used;
+ }
+
+#if GFX_VERx10 == 125
+ /* DG2: Wa_22011440098
+ * MTL: Wa_18022330953
+ *
+ * In 3D mode, after programming push constant alloc command immediately
+ * program push constant command(ZERO length) without any commit between
+ * them.
+ */
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
+ /* Update empty push constants for all stages (bitmask = 11111b) */
+ c.ShaderUpdateEnable = 0x1f;
+ c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
+ }
+#endif
+
+ cmd_buffer->state.gfx.push_constant_stages = stages;
+
+ /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
+ *
+ * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
+ * the next 3DPRIMITIVE command after programming the
+ * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
+ *
+ * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
+ * pipeline setup, we need to dirty push constants.
+ */
+ cmd_buffer->state.push_constants_dirty |= stages;
+}
+
+static void
+cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t stages)
+{
+ static const uint32_t sampler_state_opcodes[] = {
+ [MESA_SHADER_VERTEX] = 43,
+ [MESA_SHADER_TESS_CTRL] = 44, /* HS */
+ [MESA_SHADER_TESS_EVAL] = 45, /* DS */
+ [MESA_SHADER_GEOMETRY] = 46,
+ [MESA_SHADER_FRAGMENT] = 47,
+ };
+
+ static const uint32_t binding_table_opcodes[] = {
+ [MESA_SHADER_VERTEX] = 38,
+ [MESA_SHADER_TESS_CTRL] = 39,
+ [MESA_SHADER_TESS_EVAL] = 40,
+ [MESA_SHADER_GEOMETRY] = 41,
+ [MESA_SHADER_FRAGMENT] = 42,
+ };
+
+ anv_foreach_stage(s, stages) {
+ assert(s < ARRAY_SIZE(binding_table_opcodes));
+
+ if (cmd_buffer->state.samplers[s].alloc_size > 0) {
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
+ ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
+ ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
+ }
+ }
+
+ /* Always emit binding table pointers if we're asked to, since on SKL
+ * this is what flushes push constants. */
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
+ btp._3DCommandSubOpcode = binding_table_opcodes[s];
+ btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
+ }
+ }
+}
+
+static struct anv_address
+get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_shader_bin *shader,
+ const struct anv_push_range *range)
+{
+ struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ switch (range->set) {
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
+ /* This is a descriptor set buffer so the set index is
+ * actually given by binding->binding. (Yes, that's
+ * confusing.)
+ */
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->index];
+ return anv_descriptor_set_address(set);
+ }
+
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
+ return anv_address_from_u64(
+ anv_cmd_buffer_descriptor_buffer_address(
+ cmd_buffer,
+ gfx_state->base.descriptor_buffers[range->index].buffer_index) +
+ gfx_state->base.descriptor_buffers[range->index].buffer_offset);
+ }
+
+ case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
+ if (gfx_state->base.push_constants_state.alloc_size == 0) {
+ gfx_state->base.push_constants_state =
+ anv_cmd_buffer_gfx_push_constants(cmd_buffer);
+ }
+ return anv_cmd_buffer_temporary_state_address(
+ cmd_buffer, gfx_state->base.push_constants_state);
+ }
+
+ default: {
+ assert(range->set < MAX_SETS);
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->set];
+ const struct anv_descriptor *desc =
+ &set->descriptors[range->index];
+
+ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
+ if (desc->buffer) {
+ return anv_address_add(desc->buffer->address,
+ desc->offset);
+ }
+ } else {
+ assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
+ if (desc->buffer) {
+ const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
+ uint32_t dynamic_offset =
+ pipe_state->dynamic_offsets[
+ range->set].offsets[range->dynamic_offset_index];
+ return anv_address_add(desc->buffer->address,
+ desc->offset + dynamic_offset);
+ }
+ }
+
+ /* For NULL UBOs, we just return an address in the workaround BO. We do
+ * writes to it for workarounds but always at the bottom. The higher
+ * bytes should be all zeros.
+ */
+ assert(range->length * 32 <= 2048);
+ return (struct anv_address) {
+ .bo = cmd_buffer->device->workaround_bo,
+ .offset = 1024,
+ };
+ }
+ }
+}
+
+
+/** Returns the size in bytes of the bound buffer
+ *
+ * The range is relative to the start of the buffer, not the start of the
+ * range. The returned range may be smaller than
+ *
+ * (range->start + range->length) * 32;
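+ *
+ * As a worked illustration of the dynamic-UBO clamp below (assuming
+ * ANV_UBO_ALIGNMENT is 64): desc->offset = 256, dynamic_offset = 512,
+ * desc->range = 1024 and a 1500-byte buffer give
+ *
+ *    offset      = 256 + 512 = 768
+ *    bound_range = MIN2(1024, 1500 - 768) = 732
+ *    aligned     = align(732, 64) = 768
+ *
+ * i.e. the size reported here is aligned up and may extend slightly past
+ * the end of the buffer.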
+ */
+static uint32_t
+get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_shader_bin *shader,
+ const struct anv_push_range *range)
+{
+ assert(shader->stage != MESA_SHADER_COMPUTE);
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ switch (range->set) {
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->index];
+ struct anv_state state = set->desc_surface_mem;
+ assert(range->start * 32 < state.alloc_size);
+ assert((range->start + range->length) * 32 <= state.alloc_size);
+ return state.alloc_size;
+ }
+
+ case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
+ return gfx_state->base.pipeline->layout.set[
+ range->index].layout->descriptor_buffer_surface_size;
+
+ case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
+ return (range->start + range->length) * 32;
+
+ default: {
+ assert(range->set < MAX_SETS);
+ struct anv_descriptor_set *set =
+ gfx_state->base.descriptors[range->set];
+ const struct anv_descriptor *desc =
+ &set->descriptors[range->index];
+
+ if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
+ /* Here we promote a UBO to a binding table entry so that we can avoid
+ * a layer of indirection. We use the descriptor set's internally
+ * allocated surface state to fill the binding table entry.
+ */
+ if (!desc->buffer)
+ return 0;
+
+ if (range->start * 32 > desc->bind_range)
+ return 0;
+
+ return desc->bind_range;
+ } else {
+ if (!desc->buffer)
+ return 0;
+
+ assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
+ /* Compute the offset within the buffer */
+ const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
+ uint32_t dynamic_offset =
+ pipe_state->dynamic_offsets[
+ range->set].offsets[range->dynamic_offset_index];
+ uint64_t offset = desc->offset + dynamic_offset;
+ /* Clamp to the buffer size */
+ offset = MIN2(offset, desc->buffer->vk.size);
+ /* Clamp the range to the buffer size */
+ uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
+
+ /* Align the range for consistency */
+ bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
+
+ return bound_range;
+ }
+ }
+ }
+}
+
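+/* Emit 3DSTATE_CONSTANT_XS for a single stage. Note that the ranges are
+ * packed into the highest constant buffer slots (see the Skylake PRM quote
+ * in the body): with buffer_count == 2, shift = 4 - 2 = 2, so ranges 0 and
+ * 1 land in ConstantBody slots 2 and 3, and slot 0 is only ever used when
+ * all four ranges are present.
+ */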
+static void
+cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
+ gl_shader_stage stage,
+ struct anv_address *buffers,
+ unsigned buffer_count)
+{
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx_state->base.pipeline);
+
+ static const uint32_t push_constant_opcodes[] = {
+ [MESA_SHADER_VERTEX] = 21,
+ [MESA_SHADER_TESS_CTRL] = 25, /* HS */
+ [MESA_SHADER_TESS_EVAL] = 26, /* DS */
+ [MESA_SHADER_GEOMETRY] = 22,
+ [MESA_SHADER_FRAGMENT] = 23,
+ };
+
+ assert(stage < ARRAY_SIZE(push_constant_opcodes));
+
+ UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
+ c._3DCommandSubOpcode = push_constant_opcodes[stage];
+
+ /* Set MOCS.
+ *
+ * We only have one MOCS field for the whole packet, not one per
+ * buffer. We could go out of our way here to walk over all of
+ * the buffers and see if any of them are used externally and use
+ * the external MOCS. However, the notion that someone would use
+ * the same bit of memory for both scanout and a UBO is nuts.
+ *
+ * Let's not bother and assume it's all internal.
+ */
+ c.MOCS = mocs;
+
+ if (anv_pipeline_has_stage(pipeline, stage)) {
+ const struct anv_pipeline_bind_map *bind_map =
+ &pipeline->base.shaders[stage]->bind_map;
+
+ /* The Skylake PRM contains the following restriction:
+ *
+ * "The driver must ensure The following case does not occur
+ * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+ * buffer 3 read length equal to zero committed followed by a
+ * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+ * zero committed."
+ *
+ * To avoid this, we program the buffers in the highest slots.
+ * This way, slot 0 is only used if slot 3 is also used.
+ */
+ assert(buffer_count <= 4);
+ const unsigned shift = 4 - buffer_count;
+ for (unsigned i = 0; i < buffer_count; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+
+ /* At this point we only have non-empty ranges */
+ assert(range->length > 0);
+
+ c.ConstantBody.ReadLength[i + shift] = range->length;
+ c.ConstantBody.Buffer[i + shift] =
+ anv_address_add(buffers[i], range->start * 32);
+ }
+ }
+ }
+}
+
+#if GFX_VER >= 12
+static void
+cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t shader_mask,
+ struct anv_address *buffers,
+ uint32_t buffer_count)
+{
+ if (buffer_count == 0) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
+ c.ShaderUpdateEnable = shader_mask;
+ c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
+ }
+ return;
+ }
+
+ const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx_state->base.pipeline);
+
+ gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
+
+ const struct anv_pipeline_bind_map *bind_map =
+ &pipeline->base.shaders[stage]->bind_map;
+
+ uint32_t *dw;
+ const uint32_t buffer_mask = (1 << buffer_count) - 1;
+ const uint32_t num_dwords = 2 + 2 * buffer_count;
+
+ dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+ GENX(3DSTATE_CONSTANT_ALL),
+ .ShaderUpdateEnable = shader_mask,
+ .PointerBufferMask = buffer_mask,
+ .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
+
+ for (int i = 0; i < buffer_count; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+ GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
+ &cmd_buffer->batch, dw + 2 + i * 2,
+ &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
+ .PointerToConstantBuffer =
+ anv_address_add(buffers[i], range->start * 32),
+ .ConstantBufferReadLength = range->length,
+ });
+ }
+}
+#endif
+
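+/* Flush dirty push constants for the graphics stages. The first loop below
+ * rebuilds push_reg_mask[stage] for robustness: only the registers actually
+ * backed by the bound buffer are marked valid. A worked example for a
+ * single range with start = 2 and length = 4 whose bound size is 96 bytes:
+ *
+ *    bound_regs = MIN2(DIV_ROUND_UP(96, 32) - 2, 4) = 1
+ *
+ * so only the first of the four registers in that range is set in the mask.
+ */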
+static void
+cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
+ VkShaderStageFlags dirty_stages)
+{
+ VkShaderStageFlags flushed = 0;
+ struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx_state->base.pipeline);
+
+#if GFX_VER >= 12
+ uint32_t nobuffer_stages = 0;
+#endif
+
+ /* Compute robust pushed register access mask for each stage. */
+ anv_foreach_stage(stage, dirty_stages) {
+ if (!anv_pipeline_has_stage(pipeline, stage))
+ continue;
+
+ const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
+ if (shader->prog_data->zero_push_reg) {
+ const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
+ struct anv_push_constants *push = &gfx_state->base.push_constants;
+
+ push->push_reg_mask[stage] = 0;
+ /* Start of the current range in the shader, relative to the start of
+ * push constants in the shader.
+ */
+ unsigned range_start_reg = 0;
+ for (unsigned i = 0; i < 4; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+ if (range->length == 0)
+ continue;
+
+ unsigned bound_size =
+ get_push_range_bound_size(cmd_buffer, shader, range);
+ if (bound_size >= range->start * 32) {
+ unsigned bound_regs =
+ MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
+ range->length);
+ assert(range_start_reg + bound_regs <= 64);
+ push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
+ bound_regs);
+ }
+
+ cmd_buffer->state.push_constants_dirty |=
+ mesa_to_vk_shader_stage(stage);
+
+ range_start_reg += range->length;
+ }
+ }
+ }
+
+ /* Setting NULL resets the push constant state so that we allocate a new one
+ * if needed. If the push constant data is not dirty, get_push_range_address
+ * can re-use the existing allocation.
+ */
+ if (gfx_state->base.push_constants_data_dirty)
+ gfx_state->base.push_constants_state = ANV_STATE_NULL;
+
+ anv_foreach_stage(stage, dirty_stages) {
+ unsigned buffer_count = 0;
+ flushed |= mesa_to_vk_shader_stage(stage);
+ UNUSED uint32_t max_push_range = 0;
+
+ struct anv_address buffers[4] = {};
+ if (anv_pipeline_has_stage(pipeline, stage)) {
+ const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
+ const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
+
+ /* We have to gather buffer addresses as a second step because the
+ * loop above puts data into the push constant area and the call to
+ * get_push_range_address is what locks our push constants and copies
+ * them into the actual GPU buffer. If we did the two loops at the
+ * same time, we'd risk only having some of the sizes in the push
+ * constant buffer when we did the copy.
+ */
+ for (unsigned i = 0; i < 4; i++) {
+ const struct anv_push_range *range = &bind_map->push_ranges[i];
+ if (range->length == 0)
+ break;
+
+ buffers[i] = get_push_range_address(cmd_buffer, shader, range);
+ max_push_range = MAX2(max_push_range, range->length);
+ buffer_count++;
+ }
+
+ /* We have at most 4 buffers but they should be tightly packed */
+ for (unsigned i = buffer_count; i < 4; i++)
+ assert(bind_map->push_ranges[i].length == 0);
+ }
+
+#if GFX_VER >= 12
+ /* If this stage doesn't have any push constants, emit it later in a
+ * single CONSTANT_ALL packet.
+ */
+ if (buffer_count == 0) {
+ nobuffer_stages |= 1 << stage;
+ continue;
+ }
+
+ /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
+ * contains only 5 bits, so we can only use it for buffers smaller than
+ * 32 registers (1KB).
+ *
+ * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
+ * in 3DSTATE_CONSTANT_ALL. It should still be safe to use the command
+ * for disabling stages, where all address bits are zero. However, we
+ * can't safely use it for general buffers with arbitrary addresses.
+ * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
+ * case.
+ */
+ if (max_push_range < 32 && GFX_VERx10 > 120) {
+ cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
+ buffers, buffer_count);
+ continue;
+ }
+#endif
+
+ cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
+ }
+
+#if GFX_VER >= 12
+ if (nobuffer_stages)
+ /* Wa_16011448509: all address bits are zero */
+ cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
+#endif
+
+ cmd_buffer->state.push_constants_dirty &= ~flushed;
+ gfx_state->base.push_constants_data_dirty = false;
+}
+
+#if GFX_VERx10 >= 125
+static void
+cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
+ VkShaderStageFlags dirty_stages)
+{
+ struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx_state->base.pipeline);
+
+ if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+
+ const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_TASK];
+ const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
+ const struct anv_push_range *range = &bind_map->push_ranges[0];
+ if (range->length > 0) {
+ struct anv_address buffer =
+ get_push_range_address(cmd_buffer, shader, range);
+
+ uint64_t addr = anv_address_physical(buffer);
+ data.InlineData[0] = addr & 0xffffffff;
+ data.InlineData[1] = addr >> 32;
+
+ memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
+ cmd_buffer->state.gfx.base.push_constants.client_data,
+ BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
+ }
+ }
+ }
+
+ if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
+
+ const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_MESH];
+ const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
+ const struct anv_push_range *range = &bind_map->push_ranges[0];
+ if (range->length > 0) {
+ struct anv_address buffer =
+ get_push_range_address(cmd_buffer, shader, range);
+
+ uint64_t addr = anv_address_physical(buffer);
+ data.InlineData[0] = addr & 0xffffffff;
+ data.InlineData[1] = addr >> 32;
+
+ memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
+ cmd_buffer->state.gfx.base.push_constants.client_data,
+ BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
+ }
+ }
+ }
+
+ cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
+}
+#endif
+
+ALWAYS_INLINE static void
+genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
+ return;
+
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
+}
+
+ALWAYS_INLINE static void
+genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
+{
+#if INTEL_NEEDS_WA_22018402687
+ /* Wa_22018402687:
+ * In any 3D enabled context, just before any Tessellation enabled draw
+ * call (3D Primitive), re-send the last programmed 3DSTATE_DS again.
+ * This will make sure that the 3DSTATE_INT generated just before the
+ * draw call will have TDS dirty which will make sure TDS will launch the
+ * state thread before the draw call.
+ *
+ * This fixes a hang resulting from running anything using tessellation
+ * after a switch away from the mesh pipeline.
+ * We don't need to track said switch, as it matters at the HW level, and
+ * can be triggered even across processes, so we apply the Wa at all times.
+ */
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
+ return;
+
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
+#endif
+}
+
+ALWAYS_INLINE static void
+genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ uint32_t *p;
+
+ assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
+
+ genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);
+
+ genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
+
+ genX(flush_descriptor_buffers)(cmd_buffer, &cmd_buffer->state.gfx.base);
+
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ /* Wa_14015814527
+ *
+ * Apply task URB workaround when switching from task to primitive.
+ */
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
+ if (anv_pipeline_is_primitive(pipeline)) {
+ genX(apply_task_urb_workaround)(cmd_buffer);
+ } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+ cmd_buffer->state.gfx.used_task_shader = true;
+ }
+ }
+
+ /* Apply any pending pipeline flushes we may have. We want to apply them
+ * now because, if any of those flushes are for things like push constants,
+ * the GPU will read the state at weird times.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ /* Check what vertex buffers have been rebound against the set of bindings
+ * being used by the current set of vertex attributes.
+ */
+ uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
+   /* If the pipeline changed, we have to consider all the valid bindings. */
+ if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
+ vb_emit |= dyn->vi->bindings_valid;
+
+ if (vb_emit) {
+ const uint32_t num_buffers = __builtin_popcount(vb_emit);
+ const uint32_t num_dwords = 1 + num_buffers * 4;
+
+ p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+ GENX(3DSTATE_VERTEX_BUFFERS));
+ uint32_t i = 0;
+ u_foreach_bit(vb, vb_emit) {
+ struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
+ uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
+
+ struct GENX(VERTEX_BUFFER_STATE) state;
+ if (buffer) {
+ uint32_t stride = dyn->vi_binding_strides[vb];
+ UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
+
+ state = (struct GENX(VERTEX_BUFFER_STATE)) {
+ .VertexBufferIndex = vb,
+
+ .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
+ ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
+ .AddressModifyEnable = true,
+ .BufferPitch = stride,
+ .BufferStartingAddress = anv_address_add(buffer->address, offset),
+ .NullVertexBuffer = offset >= buffer->vk.size,
+#if GFX_VER >= 12
+ .L3BypassDisable = true,
+#endif
+
+ .BufferSize = size,
+ };
+ } else {
+ state = (struct GENX(VERTEX_BUFFER_STATE)) {
+ .VertexBufferIndex = vb,
+ .NullVertexBuffer = true,
+ .MOCS = anv_mocs(cmd_buffer->device, NULL,
+ ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
+ };
+ }
+
+#if GFX_VER == 9
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
+ state.BufferStartingAddress,
+ state.BufferSize);
+#endif
+
+ GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
+ i++;
+ }
+ }
+
+ cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
+
+ const bool any_dynamic_state_dirty =
+ vk_dynamic_graphics_state_any_dirty(dyn);
+ uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
+ pipeline->base.base.active_stages;
+
+ descriptors_dirty |=
+ genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
+ &cmd_buffer->state.gfx.base,
+ &pipeline->base.base);
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive. */
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE ||
+ (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343)) {
+ genX(emit_hs)(cmd_buffer);
+ }
+
+ if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
+ !any_dynamic_state_dirty &&
+ ((cmd_buffer->state.push_constants_dirty &
+ (VK_SHADER_STAGE_ALL_GRAPHICS |
+ VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
+ return;
+
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
+ /* Wa_16011411144:
+ *
+ * SW must insert a PIPE_CONTROL cmd before and after the
+ * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
+ * state is not combined with other state changes.
+ */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "before SO_BUFFER change WA");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ }
+
+ /* We don't need any per-buffer dirty tracking because you're not
+ * allowed to bind different XFB buffers while XFB is enabled.
+ */
+ for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+ struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
+#if GFX_VER < 12
+ sob.SOBufferIndex = idx;
+#else
+ sob._3DCommandOpcode = 0;
+ sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
+#endif
+
+ if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
+ sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
+ ISL_SURF_USAGE_STREAM_OUT_BIT);
+ sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
+ xfb->offset);
+ sob.SOBufferEnable = true;
+ sob.StreamOffsetWriteEnable = false;
+ /* Size is in DWords - 1 */
+ sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
+ } else {
+ sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
+ }
+ }
+ }
+
+ if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
+ /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "after SO_BUFFER change WA");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ } else if (GFX_VER >= 10) {
+ /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "after 3DSTATE_SO_BUFFER call");
+ }
+ }
+
+ /* Flush the runtime state into the HW state tracking */
+ if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
+ genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);
+
+   /* Flush the HW state into the command buffer */
+ if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
+ genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
+
+ /* If the pipeline changed, we may need to re-allocate push constant space
+ * in the URB.
+ */
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
+ cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
+
+ /* Also add the relocations (scratch buffers) */
+ VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
+ pipeline->base.base.batch.relocs);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+ }
+
+ /* Render targets live in the same binding table as fragment descriptors */
+ if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
+ descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+
+ /* We emit the binding tables and sampler tables first, then emit push
+ * constants and then finally emit binding table and sampler table
+ * pointers. It has to happen in this order, since emitting the binding
+ * tables may change the push constants (in case of storage images). After
+ * emitting push constants, on SKL+ we have to emit the corresponding
+ * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
+ */
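+   /* Concretely, the order below is: genX(cmd_buffer_flush_descriptor_sets)
+    * emits the binding/sampler tables, cmd_buffer_flush_gfx_push_constants()
+    * (and cmd_buffer_flush_mesh_inline_data() where supported) then push the
+    * constants, and cmd_buffer_emit_descriptor_pointers() finally emits the
+    * pointer packets for whatever became dirty.
+    */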
+ uint32_t dirty = 0;
+ if (descriptors_dirty) {
+ dirty = genX(cmd_buffer_flush_descriptor_sets)(
+ cmd_buffer,
+ &cmd_buffer->state.gfx.base,
+ descriptors_dirty,
+ pipeline->base.shaders,
+ ARRAY_SIZE(pipeline->base.shaders));
+ cmd_buffer->state.descriptors_dirty &= ~dirty;
+ }
+
+ if (dirty || cmd_buffer->state.push_constants_dirty) {
+ /* Because we're pushing UBOs, we have to push whenever either
+ * descriptors or push constants is dirty.
+ */
+ dirty |= cmd_buffer->state.push_constants_dirty &
+ pipeline->base.base.active_stages;
+ cmd_buffer_flush_gfx_push_constants(cmd_buffer,
+ dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
+#if GFX_VERx10 >= 125
+ cmd_buffer_flush_mesh_inline_data(
+ cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
+ VK_SHADER_STAGE_MESH_BIT_EXT));
+#endif
+ }
+
+ if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
+ cmd_buffer_emit_descriptor_pointers(cmd_buffer,
+ dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
+ }
+
+ /* When we're done, there is no more dirty gfx state. */
+ cmd_buffer->state.gfx.dirty = 0;
+}
+
+ALWAYS_INLINE static bool
+anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
+{
+ const struct anv_device *device = cmd_buffer->device;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ /* We cannot generate readable commands in protected mode. */
+ if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
+ return false;
+
+ /* Limit generated draws to pipelines without HS stage. This makes things
+ * simpler for implementing Wa_1306463417, Wa_16011107343.
+ */
+ if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
+ return false;
+
+ return count >= device->physical->instance->generated_indirect_threshold;
+}
+
+#include "genX_cmd_draw_helpers.h"
+#include "genX_cmd_draw_generated_indirect.h"
+
+#if GFX_VER >= 11
+#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
+#else
+#define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
+#endif
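+
+/* On Gfx11+ direct draws go through 3DPRIMITIVE_EXTENDED so that the
+ * extended parameters (see the GEN11_3DPRIM_XP* comment further down) can be
+ * supplied inline with the draw, rather than through the separate base
+ * vertex/instance uploads done by cmd_buffer_emit_vertex_constants_and_flush()
+ * on the pre-Gfx11 paths.
+ */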
+
+void genX(CmdDraw)(
+ VkCommandBuffer commandBuffer,
+ uint32_t vertexCount,
+ uint32_t instanceCount,
+ uint32_t firstVertex,
+ uint32_t firstInstance)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ const uint32_t count =
+ vertexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw", count);
+ trace_intel_begin_draw(&cmd_buffer->trace);
+
+ /* Select pipeline here to allow
+ * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
+ * cmd_buffer_flush_gfx_state().
+ */
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VER < 11
+ cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
+ get_vs_prog_data(pipeline),
+ firstVertex, firstInstance, 0,
+ false /* force_flush */);
+#endif
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.VertexCountPerInstance = vertexCount;
+ prim.StartVertexLocation = firstVertex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = 0;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+ prim.ExtendedParameter0 = firstVertex;
+ prim.ExtendedParameter1 = firstInstance;
+ prim.ExtendedParameter2 = 0;
+#endif
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ vertexCount);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+
+ trace_intel_end_draw(&cmd_buffer->trace, count);
+}
+
+void genX(CmdDrawMultiEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t drawCount,
+ const VkMultiDrawInfoEXT *pVertexInfo,
+ uint32_t instanceCount,
+ uint32_t firstInstance,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ UNUSED struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ uint32_t i = 0;
+#if GFX_VER < 11
+ vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
+ cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
+ get_vs_prog_data(pipeline),
+ draw->firstVertex,
+ firstInstance, i, !i);
+
+ const uint32_t count =
+ draw->vertexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw multi", count);
+ trace_intel_begin_draw_multi(&cmd_buffer->trace);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.VertexCountPerInstance = draw->vertexCount;
+ prim.StartVertexLocation = draw->firstVertex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = 0;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pVertexInfo[drawCount - 1].vertexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ trace_intel_end_draw_multi(&cmd_buffer->trace, count);
+ }
+#else
+ vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
+ * first one was handled by cmd_buffer_flush_gfx_state.
+ */
+ if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
+ genX(emit_hs)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ const uint32_t count = draw->vertexCount * instanceCount;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw multi", count);
+ trace_intel_begin_draw_multi(&cmd_buffer->trace);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.VertexCountPerInstance = draw->vertexCount;
+ prim.StartVertexLocation = draw->firstVertex;
+ prim.InstanceCount = instanceCount;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = 0;
+ prim.ExtendedParametersPresent = true;
+ prim.ExtendedParameter0 = draw->firstVertex;
+ prim.ExtendedParameter1 = firstInstance;
+ prim.ExtendedParameter2 = i;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pVertexInfo[drawCount - 1].vertexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ trace_intel_end_draw_multi(&cmd_buffer->trace, count);
+ }
+#endif
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+}
+
+void genX(CmdDrawIndexed)(
+ VkCommandBuffer commandBuffer,
+ uint32_t indexCount,
+ uint32_t instanceCount,
+ uint32_t firstIndex,
+ int32_t vertexOffset,
+ uint32_t firstInstance)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ const uint32_t count =
+ indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed",
+ count);
+ trace_intel_begin_draw_indexed(&cmd_buffer->trace);
+
+ /* Select pipeline here to allow
+ * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
+ * cmd_buffer_flush_gfx_state().
+ */
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VER < 11
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
+ vertexOffset, firstInstance,
+ 0, false /* force_flush */);
+#endif
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = indexCount;
+ prim.StartVertexLocation = firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = vertexOffset;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+ prim.ExtendedParameter0 = vertexOffset;
+ prim.ExtendedParameter1 = firstInstance;
+ prim.ExtendedParameter2 = 0;
+#endif
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ indexCount);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
+
+ trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
+}
+
+void genX(CmdDrawMultiIndexedEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t drawCount,
+ const VkMultiDrawIndexedInfoEXT *pIndexInfo,
+ uint32_t instanceCount,
+ uint32_t firstInstance,
+ uint32_t stride,
+ const int32_t *pVertexOffset)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ uint32_t i = 0;
+#if GFX_VER < 11
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ if (pVertexOffset) {
+ if (vs_prog_data->uses_drawid) {
+ bool emitted = true;
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
+ emitted = true;
+ }
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+ if (vs_prog_data->uses_drawid) {
+ emit_draw_index(cmd_buffer, i);
+ emitted = true;
+ }
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ if (emitted)
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ const uint32_t count =
+ draw->indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed multi",
+ count);
+ trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
+ true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = draw->indexCount;
+ prim.StartVertexLocation = draw->firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = *pVertexOffset;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pIndexInfo[drawCount - 1].indexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
+ false);
+ trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
+ emitted = false;
+ }
+ } else {
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ }
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+ const uint32_t count =
+ draw->indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed multi",
+ count);
+ trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
+ true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = draw->indexCount;
+ prim.StartVertexLocation = draw->firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = *pVertexOffset;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pIndexInfo[drawCount - 1].indexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
+ false);
+ trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
+ }
+ }
+ } else {
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+ cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
+ draw->vertexOffset,
+ firstInstance, i, i != 0);
+
+ const uint32_t count =
+ draw->indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed multi",
+ count);
+ trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = draw->indexCount;
+ prim.StartVertexLocation = draw->firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = draw->vertexOffset;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pIndexInfo[drawCount - 1].indexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
+ }
+ }
+#else
+ vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
+ * first one was handled by cmd_buffer_flush_gfx_state.
+ */
+ if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
+ genX(emit_hs)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ const uint32_t count =
+ draw->indexCount * instanceCount * pipeline->instance_multiplier;
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed multi",
+ count);
+ trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = RANDOM;
+ prim.VertexCountPerInstance = draw->indexCount;
+ prim.StartVertexLocation = draw->firstIndex;
+ prim.InstanceCount = instanceCount *
+ pipeline->instance_multiplier;
+ prim.StartInstanceLocation = firstInstance;
+ prim.BaseVertexLocation = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
+ prim.ExtendedParametersPresent = true;
+ prim.ExtendedParameter0 = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
+ prim.ExtendedParameter1 = firstInstance;
+ prim.ExtendedParameter2 = i;
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ drawCount == 0 ? 0 :
+ pIndexInfo[drawCount - 1].indexCount);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+ trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
+ }
+#endif
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
+}
+
+/* Auto-Draw / Indirect Registers */
+#define GFX7_3DPRIM_END_OFFSET 0x2420
+#define GFX7_3DPRIM_START_VERTEX 0x2430
+#define GFX7_3DPRIM_VERTEX_COUNT 0x2434
+#define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
+#define GFX7_3DPRIM_START_INSTANCE 0x243C
+#define GFX7_3DPRIM_BASE_VERTEX 0x2440
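+
+/* These are the MMIO registers that 3DPRIMITIVE reads its parameters from
+ * when IndirectParameterEnable is set; the indirect draw paths below load
+ * them with MI commands (see load_indirect_parameters()) instead of encoding
+ * the values in the command itself.
+ */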
+
+/* On Gen11+, we have three custom "extended parameters" which we can use to
+ * provide extra system-generated values to shaders. Our assignment of these
+ * is arbitrary; we choose to assign them as follows:
+ *
+ * gl_BaseVertex = XP0
+ * gl_BaseInstance = XP1
+ * gl_DrawID = XP2
+ *
+ * For gl_BaseInstance, we never actually have to set up the value because we
+ * can just program 3DSTATE_VF_SGVS_2 to load it implicitly. We can also do
+ * that for gl_BaseVertex but it does the wrong thing for indexed draws.
+ */
+#define GEN11_3DPRIM_XP0 0x2690
+#define GEN11_3DPRIM_XP1 0x2694
+#define GEN11_3DPRIM_XP2 0x2698
+#define GEN11_3DPRIM_XP_BASE_VERTEX GEN11_3DPRIM_XP0
+#define GEN11_3DPRIM_XP_BASE_INSTANCE GEN11_3DPRIM_XP1
+#define GEN11_3DPRIM_XP_DRAW_ID GEN11_3DPRIM_XP2
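+
+/* For example, genX(CmdDrawIndirectByteCountEXT) below programs
+ * GEN11_3DPRIM_XP_BASE_VERTEX and GEN11_3DPRIM_XP_DRAW_ID with MI stores
+ * before its 3DPRIMITIVE, while direct draws such as genX(CmdDraw) supply
+ * the same values inline through ExtendedParameter0-2.
+ */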
+
+void genX(CmdDrawIndirectByteCountEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t instanceCount,
+ uint32_t firstInstance,
+ VkBuffer counterBuffer,
+ VkDeviceSize counterBufferOffset,
+ uint32_t counterOffset,
+ uint32_t vertexStride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ /* firstVertex is always zero for this draw function */
+ const uint32_t firstVertex = 0;
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indirect byte count",
+ instanceCount * pipeline->instance_multiplier);
+ trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
+
+ /* Select pipeline here to allow
+ * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
+ * emit_base_vertex_instance() & emit_draw_index().
+ */
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VER < 11
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance)
+ emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
+ if (vs_prog_data->uses_drawid)
+ emit_draw_index(cmd_buffer, 0);
+#endif
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
+ mi_builder_set_mocs(&b, mocs);
+ struct mi_value count =
+ mi_mem32(anv_address_add(counter_buffer->address,
+ counterBufferOffset));
+ if (counterOffset)
+ count = mi_isub(&b, count, mi_imm(counterOffset));
+ count = mi_udiv32_imm(&b, count, vertexStride);
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
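+   /* i.e. vertexCount = (counter value - counterOffset) / vertexStride,
+    * turning the byte offset written by transform feedback back into a
+    * vertex count.
+    */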
+
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
+ mi_imm(instanceCount * pipeline->instance_multiplier));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
+
+#if GFX_VER >= 11
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
+ mi_imm(firstVertex));
+ /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
+#endif
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.IndirectParameterEnable = true;
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = SEQUENTIAL;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+#endif
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ 1);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+
+ trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
+ instanceCount * pipeline->instance_multiplier);
+}
+
+static void
+load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ bool indexed,
+ uint32_t draw_id)
+{
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
+ mi_builder_set_mocs(&b, mocs);
+
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
+ mi_mem32(anv_address_add(addr, 0)));
+
+ struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
+ if (pipeline->instance_multiplier > 1) {
+ instance_count = mi_imul_imm(&b, instance_count,
+ pipeline->instance_multiplier);
+ }
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
+
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
+ mi_mem32(anv_address_add(addr, 8)));
+
+ if (indexed) {
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
+ mi_mem32(anv_address_add(addr, 12)));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
+ mi_mem32(anv_address_add(addr, 16)));
+#if GFX_VER >= 11
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
+ mi_mem32(anv_address_add(addr, 12)));
+ /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
+#endif
+ } else {
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
+ mi_mem32(anv_address_add(addr, 12)));
+ mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
+#if GFX_VER >= 11
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
+ mi_mem32(anv_address_add(addr, 8)));
+ /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
+#endif
+ }
+
+#if GFX_VER >= 11
+ mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
+ mi_imm(draw_id));
+#endif
+}
+
+static bool
+execute_indirect_draw_supported(struct anv_cmd_buffer *cmd_buffer)
+{
+#if GFX_VERx10 >= 125
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ const bool is_multiview = pipeline->instance_multiplier > 1;
+
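+   /* EXECUTE_INDIRECT_DRAW consumes the application's indirect parameters
+    * as-is, so presumably we can only use it when nothing needs patching:
+    * no multiview instance multiplication and no firstvertex/baseinstance/
+    * drawid values to provide to the shader.
+    */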
+ return (devinfo->has_indirect_unroll &&
+ !is_multiview &&
+ !vs_prog_data->uses_firstvertex &&
+ !vs_prog_data->uses_baseinstance &&
+ !vs_prog_data->uses_drawid);
+#else
+ return false;
+#endif
+}
+
+static void
+emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ uint32_t draw_count,
+ bool indexed)
+{
+#if GFX_VER < 11
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+#endif
+ UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ UNUSED const bool aligned_stride =
+ (indirect_data_stride == 0 ||
+ (!indexed && indirect_data_stride == sizeof(VkDrawIndirectCommand)) ||
+ (indexed && indirect_data_stride == sizeof(VkDrawIndexedIndirectCommand)));
+ UNUSED const bool execute_indirect_supported =
+ execute_indirect_draw_supported(cmd_buffer);
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ uint32_t offset = 0;
+ for (uint32_t i = 0; i < draw_count; i++) {
+ struct anv_address draw = anv_address_add(indirect_data_addr, offset);
+
+#if GFX_VER < 11
+ /* TODO: We need to stomp base vertex to 0 somehow */
+
+ /* With sequential draws, we're dealing with the VkDrawIndirectCommand
+ * structure data. We want to load VkDrawIndirectCommand::firstVertex at
+ * offset 8 in the structure.
+ *
+ * With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
+       * We want the VkDrawIndexedIndirectCommand::vertexOffset field at
+       * offset 12 in the structure.
+ */
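+      /* For reference, the Vulkan layouts are:
+       *   VkDrawIndirectCommand        = { vertexCount, instanceCount,
+       *                                    firstVertex, firstInstance }
+       *   VkDrawIndexedIndirectCommand = { indexCount, instanceCount,
+       *                                    firstIndex, vertexOffset,
+       *                                    firstInstance }
+       * with each field 4 bytes, hence offsets 8 and 12 above.
+       */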
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance_bo(cmd_buffer,
+ anv_address_add(draw, indexed ? 12 : 8));
+ }
+ if (vs_prog_data->uses_drawid)
+ emit_draw_index(cmd_buffer, i);
+#endif
+
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
+ * first one was handled by cmd_buffer_flush_gfx_state.
+ */
+ if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
+ genX(emit_hs)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ if (execute_indirect_supported) {
+#if GFX_VERx10 >= 125
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+ anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
+ ind.ArgumentFormat = indexed ? DRAWINDEXED : DRAW;
+ ind.TBIMREnabled = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+ ind.PredicateEnable =
+ cmd_buffer->state.conditional_render_enabled;
+ ind.MaxCount = aligned_stride ? draw_count : 1;
+ ind.ArgumentBufferStartAddress = draw;
+ ind.MOCS =
+ anv_mocs(cmd_buffer->device, draw.bo, 0);
+ }
+ /* If all the indirect structures are aligned, then we can let the HW
+ * do the unrolling and we only need one instruction. Otherwise we
+ * need to emit one instruction per draw, but we're still avoiding
+ * the register loads with MI commands.
+ */
+ if (aligned_stride)
+ break;
+#else
+ unreachable("EXECUTE_INDIRECT_DRAW instruction expectation mismatch");
+#endif
+ } else {
+ load_indirect_parameters(cmd_buffer, draw, indexed, i);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.IndirectParameterEnable = true;
+ prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+#endif
+ }
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ 1);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer,
+ indexed ? RANDOM : SEQUENTIAL);
+
+ offset += indirect_data_stride;
+ }
+}
+
+void genX(CmdDrawIndirect)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ uint32_t drawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indirect",
+ drawCount);
+ trace_intel_begin_draw_indirect(&cmd_buffer->trace);
+
+ if (anv_use_generated_draws(cmd_buffer, drawCount)) {
+ genX(cmd_buffer_emit_indirect_generated_draws)(
+ cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ MAX2(stride, sizeof(VkDrawIndirectCommand)),
+ ANV_NULL_ADDRESS /* count_addr */,
+ drawCount,
+ false /* indexed */);
+ } else {
+ emit_indirect_draws(cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ stride, drawCount, false /* indexed */);
+ }
+
+ trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
+}
+
+void genX(CmdDrawIndexedIndirect)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ uint32_t drawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed indirect",
+ drawCount);
+ trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
+
+ if (anv_use_generated_draws(cmd_buffer, drawCount)) {
+ genX(cmd_buffer_emit_indirect_generated_draws)(
+ cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ MAX2(stride, sizeof(VkDrawIndexedIndirectCommand)),
+ ANV_NULL_ADDRESS /* count_addr */,
+ drawCount,
+ true /* indexed */);
+ } else {
+ emit_indirect_draws(cmd_buffer,
+ anv_address_add(buffer->address, offset),
+ stride, drawCount, true /* indexed */);
+ }
+
+ trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
+}
+
+#define MI_PREDICATE_SRC0 0x2400
+#define MI_PREDICATE_SRC1 0x2408
+#define MI_PREDICATE_RESULT 0x2418
+
+static struct mi_value
+prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ struct anv_address count_address)
+{
+ struct mi_value ret = mi_imm(0);
+
+ if (cmd_buffer->state.conditional_render_enabled) {
+ ret = mi_new_gpr(b);
+ mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
+ } else {
+      /* Upload the current draw count from the count buffer to
+       * MI_PREDICATE_SRC0.
+ */
+ mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
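+      /* emit_draw_count_predicate() only rewrites the low DWord of
+       * MI_PREDICATE_SRC1 per draw, so clear the high DWord once here.
+       */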
+ mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
+ }
+
+ return ret;
+}
+
+static void
+emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ uint32_t draw_index)
+{
+ /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
+ mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
+
+ if (draw_index == 0) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOADINV;
+ mip.CombineOperation = COMBINE_SET;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+ } else {
+ /* While draw_index < draw_count the predicate's result will be
+ * (draw_index == draw_count) ^ TRUE = TRUE
+ * When draw_index == draw_count the result is
+ * (TRUE) ^ TRUE = FALSE
+ * After this all results will be:
+ * (FALSE) ^ FALSE = FALSE
+ */
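+       /* For example, with a draw count of 2 loaded into SRC0:
+        *   draw_index 0: LOADINV -> !(2 == 0)         = TRUE  (executes)
+        *   draw_index 1: XOR     ->  (2 == 1) ^ TRUE  = TRUE  (executes)
+        *   draw_index 2: XOR     ->  (2 == 2) ^ TRUE  = FALSE (skipped)
+        *   draw_index 3: XOR     ->  (2 == 3) ^ FALSE = FALSE (skipped)
+        */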
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = LOAD_LOAD;
+ mip.CombineOperation = COMBINE_XOR;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+ }
+}
+
+static void
+emit_draw_count_predicate_with_conditional_render(
+ struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ uint32_t draw_index,
+ struct mi_value max)
+{
+ struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
+ pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
+
+ mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
+}
+
+static void
+emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ uint32_t draw_index,
+ struct mi_value max)
+{
+ if (cmd_buffer->state.conditional_render_enabled) {
+ emit_draw_count_predicate_with_conditional_render(
+ cmd_buffer, b, draw_index, mi_value_ref(b, max));
+ } else {
+ emit_draw_count_predicate(cmd_buffer, b, draw_index);
+ }
+}
+
+static void
+emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint64_t indirect_data_stride,
+ struct anv_address draw_count_addr,
+ uint32_t max_draw_count,
+ bool indexed)
+{
+#if GFX_VER < 11
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+#endif
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
+ mi_builder_set_mocs(&b, mocs);
+ struct mi_value max =
+ prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
+
+ for (uint32_t i = 0; i < max_draw_count; i++) {
+ struct anv_address draw =
+ anv_address_add(indirect_data_addr, i * indirect_data_stride);
+
+ emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
+
+#if GFX_VER < 11
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance_bo(cmd_buffer,
+ anv_address_add(draw, indexed ? 12 : 8));
+ }
+ if (vs_prog_data->uses_drawid)
+ emit_draw_index(cmd_buffer, i);
+
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+#endif
+
+ load_indirect_parameters(cmd_buffer, draw, indexed, i);
+
+ /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
+ * first one was handled by cmd_buffer_flush_gfx_state.
+ */
+ if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
+ genX(emit_hs)(cmd_buffer);
+ genX(emit_ds)(cmd_buffer);
+
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
+ anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
+#if GFX_VERx10 >= 125
+ prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+#endif
+ prim.IndirectParameterEnable = true;
+ prim.PredicateEnable = true;
+ prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
+#if GFX_VER >= 11
+ prim.ExtendedParametersPresent = true;
+#endif
+ }
+
+ genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
+ cmd_buffer->device,
+ cmd_buffer->state.gfx.primitive_topology,
+ 1);
+ genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
+
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
+ }
+
+ mi_value_unref(&b, max);
+}
+
+void genX(CmdDrawIndirectCount)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ VkBuffer _countBuffer,
+ VkDeviceSize countBufferOffset,
+ uint32_t maxDrawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indirect count",
+ 0);
+ trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
+
+ struct anv_address indirect_data_address =
+ anv_address_add(buffer->address, offset);
+ struct anv_address count_address =
+ anv_address_add(count_buffer->address, countBufferOffset);
+ stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
+
+ if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
+ genX(cmd_buffer_emit_indirect_generated_draws)(
+ cmd_buffer,
+ indirect_data_address,
+ stride,
+ count_address,
+ maxDrawCount,
+ false /* indexed */);
+ } else {
+ emit_indirect_count_draws(cmd_buffer,
+ indirect_data_address,
+ stride,
+ count_address,
+ maxDrawCount,
+ false /* indexed */);
+ }
+
+ trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
+}
+
+void genX(CmdDrawIndexedIndirectCount)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ VkBuffer _countBuffer,
+ VkDeviceSize countBufferOffset,
+ uint32_t maxDrawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw indexed indirect count",
+ 0);
+ trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
+
+ struct anv_address indirect_data_address =
+ anv_address_add(buffer->address, offset);
+ struct anv_address count_address =
+ anv_address_add(count_buffer->address, countBufferOffset);
+ stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
+
+ if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
+ genX(cmd_buffer_emit_indirect_generated_draws)(
+ cmd_buffer,
+ indirect_data_address,
+ stride,
+ count_address,
+ maxDrawCount,
+ true /* indexed */);
+ } else {
+ emit_indirect_count_draws(cmd_buffer,
+ indirect_data_address,
+ stride,
+ count_address,
+ maxDrawCount,
+ true /* indexed */);
+ }
+
+ trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
+}
+
+void genX(CmdBeginTransformFeedbackEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t firstCounterBuffer,
+ uint32_t counterBufferCount,
+ const VkBuffer* pCounterBuffers,
+ const VkDeviceSize* pCounterBufferOffsets)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+ assert(counterBufferCount <= MAX_XFB_BUFFERS);
+ assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+ trace_intel_begin_xfb(&cmd_buffer->trace);
+
+ /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+ *
+    * "Software must ensure that no HW stream output operations can be in
+ * process or otherwise pending at the point that the MI_LOAD/STORE
+ * commands are processed. This will likely require a pipeline flush."
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "begin transform feedback");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
+ /* If we have a counter buffer, this is a resume so we need to load the
+ * value into the streamout offset register. Otherwise, this is a begin
+ * and we need to reset it to zero.
+ */
+ if (pCounterBuffers &&
+ idx >= firstCounterBuffer &&
+ idx - firstCounterBuffer < counterBufferCount &&
+ pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
+ uint32_t cb_idx = idx - firstCounterBuffer;
+ ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+ uint64_t offset = pCounterBufferOffsets ?
+ pCounterBufferOffsets[cb_idx] : 0;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+ lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+ lrm.MemoryAddress = anv_address_add(counter_buffer->address,
+ offset);
+ }
+ } else {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+ lri.DataDWord = 0;
+ }
+ }
+ }
+
+ cmd_buffer->state.xfb_enabled = true;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+void genX(CmdEndTransformFeedbackEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t firstCounterBuffer,
+ uint32_t counterBufferCount,
+ const VkBuffer* pCounterBuffers,
+ const VkDeviceSize* pCounterBufferOffsets)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ assert(firstCounterBuffer < MAX_XFB_BUFFERS);
+ assert(counterBufferCount <= MAX_XFB_BUFFERS);
+ assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
+
+ /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
+ *
+    * "Software must ensure that no HW stream output operations can be in
+ * process or otherwise pending at the point that the MI_LOAD/STORE
+ * commands are processed. This will likely require a pipeline flush."
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CS_STALL_BIT,
+ "end transform feedback");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
+ unsigned idx = firstCounterBuffer + cb_idx;
+
+      /* If we have a counter buffer, this is a pause, so we need to store
+       * the current value of the streamout offset register into it so that
+       * a later resume can reload it. Otherwise, there is nothing to save.
+ */
+ if (pCounterBuffers &&
+ cb_idx < counterBufferCount &&
+ pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
+ ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
+ uint64_t offset = pCounterBufferOffsets ?
+ pCounterBufferOffsets[cb_idx] : 0;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+ srm.MemoryAddress = anv_address_add(counter_buffer->address,
+ offset);
+ srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
+ }
+ }
+ }
+
+ trace_intel_end_xfb(&cmd_buffer->trace);
+
+ cmd_buffer->state.xfb_enabled = false;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
+}
+
+#if GFX_VERx10 >= 125
+
+void
+genX(CmdDrawMeshTasksEXT)(
+ VkCommandBuffer commandBuffer,
+ uint32_t x,
+ uint32_t y,
+ uint32_t z)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw mesh", x * y * z);
+
+ trace_intel_begin_draw_mesh(&cmd_buffer->trace);
+
+ /* TODO(mesh): Check if this is not emitting more packets than we need. */
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
+ m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
+ m.ThreadGroupCountX = x;
+ m.ThreadGroupCountY = y;
+ m.ThreadGroupCountZ = z;
+ }
+
+ trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
+}
+
+#define GFX125_3DMESH_TG_COUNT 0x26F0
+#define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
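+
+/* mesh_load_indirect_parameters_3dmesh_3d() below fills these from a
+ * VkDrawMeshTasksIndirectCommandEXT: groupCountX into GFX125_3DMESH_TG_COUNT,
+ * groupCountY/Z into XP(1)/XP(2), and optionally the draw id into XP(0).
+ */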
+
+static void
+mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
+ struct mi_builder *b,
+ struct anv_address addr,
+ bool emit_xp0,
+ uint32_t xp0)
+{
+ const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
+ const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
+ const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
+
+ mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
+ mi_mem32(anv_address_add(addr, groupCountXOff)));
+
+ mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
+ mi_mem32(anv_address_add(addr, groupCountYOff)));
+
+ mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
+ mi_mem32(anv_address_add(addr, groupCountZOff)));
+
+ if (emit_xp0)
+ mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
+}
+
+static void
+emit_indirect_3dmesh_3d(struct anv_batch *batch,
+ bool predicate_enable,
+ bool uses_drawid)
+{
+ uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
+ uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
+ .PredicateEnable = predicate_enable,
+ .IndirectParameterEnable = true,
+ .ExtendedParameter0Present = uses_drawid);
+ if (uses_drawid)
+ dw[len - 1] = 0;
+}
+
+void
+genX(CmdDrawMeshTasksIndirectEXT)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ uint32_t drawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw mesh indirect", drawCount);
+
+ trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (cmd_state->conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
+ mesh_prog_data->uses_drawid;
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ for (uint32_t i = 0; i < drawCount; i++) {
+ struct anv_address draw = anv_address_add(buffer->address, offset);
+
+ mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
+
+ emit_indirect_3dmesh_3d(&cmd_buffer->batch,
+ cmd_state->conditional_render_enabled, uses_drawid);
+
+ offset += stride;
+ }
+
+ trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
+}
+
+void
+genX(CmdDrawMeshTasksIndirectCountEXT)(
+ VkCommandBuffer commandBuffer,
+ VkBuffer _buffer,
+ VkDeviceSize offset,
+ VkBuffer _countBuffer,
+ VkDeviceSize countBufferOffset,
+ uint32_t maxDrawCount,
+ uint32_t stride)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+ ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+
+ if (anv_batch_has_error(&cmd_buffer->batch))
+ return;
+
+ anv_measure_snapshot(cmd_buffer,
+ INTEL_SNAPSHOT_DRAW,
+ "draw mesh indirect count", 0);
+
+ trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
+
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
+ mesh_prog_data->uses_drawid;
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
+ mi_builder_set_mocs(&b, mocs);
+
+ struct mi_value max =
+ prepare_for_draw_count_predicate(
+ cmd_buffer, &b,
+ anv_address_add(count_buffer->address, countBufferOffset));
+
+ for (uint32_t i = 0; i < maxDrawCount; i++) {
+ struct anv_address draw = anv_address_add(buffer->address, offset);
+
+ emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
+
+ mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
+
+ emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
+
+ offset += stride;
+ }
+
+ trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, maxDrawCount);
+}
+
+#endif /* GFX_VERx10 >= 125 */
diff --git a/src/intel/vulkan/genX_cmd_draw_generated_flush.h b/src/intel/vulkan/genX_cmd_draw_generated_flush.h
new file mode 100644
index 00000000000..2240d1e1918
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_draw_generated_flush.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2024 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GENX_CMD_DRAW_GENERATED_FLUSH_H
+#define GENX_CMD_DRAW_GENERATED_FLUSH_H
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "util/macros.h"
+
+#include "common/intel_genX_state_brw.h"
+
+#include "anv_private.h"
+
+static void
+genX(cmd_buffer_flush_generated_draws)(struct anv_cmd_buffer *cmd_buffer)
+{
+ if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
+ return;
+
+ /* No return address setup means we don't have to do anything */
+ if (anv_address_is_null(cmd_buffer->generation.return_addr))
+ return;
+
+ struct anv_batch *batch = &cmd_buffer->generation.batch;
+
+ /* Wait for all the generation vertex shader invocations to finish generating the commands. */
+ genX(emit_apply_pipe_flushes)(batch,
+ cmd_buffer->device,
+ _3D,
+#if GFX_VER == 9
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
+#endif
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT,
+ NULL /* emitted_bits */);
+
+#if GFX_VER >= 12
+ anv_batch_emit(batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = true;
+ }
+#else
+ /* Prior to Gfx12 we cannot disable the CS prefetch, but it doesn't matter
+ * as the prefetch shouldn't follow the MI_BATCH_BUFFER_START.
+ */
+#endif
+
+ /* Return to the main batch. */
+ anv_batch_emit(batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress = cmd_buffer->generation.return_addr;
+ }
+
+ cmd_buffer->generation.return_addr = ANV_NULL_ADDRESS;
+}
+
+#endif /* GENX_CMD_DRAW_GENERATED_FLUSH_H */
diff --git a/src/intel/vulkan/genX_cmd_draw_generated_indirect.h b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
new file mode 100644
index 00000000000..0db4cffb297
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_draw_generated_indirect.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GENX_CMD_DRAW_GENERATED_INDIRECT_H
+#define GENX_CMD_DRAW_GENERATED_INDIRECT_H
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "util/macros.h"
+
+#include "common/intel_genX_state_brw.h"
+
+#include "anv_private.h"
+#include "anv_internal_kernels.h"
+
+/* This is the maximum number of items a fragment shader can generate due to
+ * the viewport size.
+ */
+#define MAX_GENERATED_DRAW_COUNT (8192 * 8192)
+
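+/* Maximum number of draw commands kept resident in the generation ring
+ * buffer at once (ring buffer mode only, see
+ * genX(cmd_buffer_emit_indirect_generated_draws_inring)).
+ */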
+#define MAX_RING_BO_ITEMS (8192)
+
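+/* Dispatch the generation shader to write item_count draw commands (starting
+ * at item_base) into generated_cmds_addr, based on the application provided
+ * indirect data. Returns the push constant state so the caller can later
+ * patch fields such as end_addr.
+ */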
+static struct anv_state
+genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_simple_shader *simple_state,
+ struct anv_address generated_cmds_addr,
+ uint32_t generated_cmd_stride,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ struct anv_address draw_id_addr,
+ uint32_t item_base,
+ uint32_t item_count,
+ struct anv_address count_addr,
+ uint32_t max_count,
+ bool indexed,
+ uint32_t ring_count)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ struct anv_state push_data_state =
+ genX(simple_shader_alloc_push)(simple_state,
+ sizeof(struct anv_gen_indirect_params));
+ if (push_data_state.map == NULL)
+ return ANV_STATE_NULL;
+
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ const bool use_tbimr = cmd_buffer->state.gfx.dyn_state.use_tbimr;
+
+ struct anv_address draw_count_addr;
+ if (anv_address_is_null(count_addr)) {
+ draw_count_addr = anv_address_add(
+ genX(simple_shader_push_state_address)(simple_state, push_data_state),
+ offsetof(struct anv_gen_indirect_params, draw_count));
+ } else {
+ draw_count_addr = count_addr;
+ }
+
+ struct anv_gen_indirect_params *push_data = push_data_state.map;
+ *push_data = (struct anv_gen_indirect_params) {
+ .draw_id_addr = anv_address_physical(draw_id_addr),
+ .indirect_data_addr = anv_address_physical(indirect_data_addr),
+ .indirect_data_stride = indirect_data_stride,
+ .flags = (use_tbimr ? ANV_GENERATED_FLAG_TBIMR : 0) |
+ (indexed ? ANV_GENERATED_FLAG_INDEXED : 0) |
+ (cmd_buffer->state.conditional_render_enabled ?
+ ANV_GENERATED_FLAG_PREDICATED : 0) |
+ ((vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) ?
+ ANV_GENERATED_FLAG_BASE : 0) |
+ (vs_prog_data->uses_drawid ? ANV_GENERATED_FLAG_DRAWID : 0) |
+ (anv_mocs(device, indirect_data_addr.bo,
+ ISL_SURF_USAGE_VERTEX_BUFFER_BIT) << 8) |
+ (!anv_address_is_null(count_addr) ?
+ ANV_GENERATED_FLAG_COUNT : 0) |
+ (ring_count != 0 ? ANV_GENERATED_FLAG_RING_MODE : 0) |
+ ((generated_cmd_stride / 4) << 16),
+ .draw_base = item_base,
+ .max_draw_count = max_count,
+ .ring_count = ring_count,
+ .instance_multiplier = pipeline->instance_multiplier,
+ .draw_count = anv_address_is_null(count_addr) ? max_count : 0,
+ .generated_cmds_addr = anv_address_physical(generated_cmds_addr),
+ .draw_count_addr = anv_address_physical(draw_count_addr),
+ };
+
+ genX(emit_simple_shader_dispatch)(simple_state, item_count, push_data_state);
+
+ return push_data_state;
+}
+
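+/* Jump from the main batch into the generation batch, record the return
+ * address and set up the internal generation shader state. The main batch is
+ * resumed later by genX(cmd_buffer_flush_generated_draws).
+ */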
+static void
+genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_buffer)
+{
+ anv_batch_emit_ensure_space(&cmd_buffer->generation.batch, 4);
+
+ trace_intel_begin_generate_draws(&cmd_buffer->trace);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress =
+ anv_batch_current_address(&cmd_buffer->generation.batch);
+ }
+
+ cmd_buffer->generation.return_addr = anv_batch_current_address(&cmd_buffer->batch);
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = false;
+ }
+#endif
+
+ trace_intel_end_generate_draws(&cmd_buffer->trace);
+
+ struct anv_shader_bin *gen_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(
+ cmd_buffer->device,
+ ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
+ &gen_kernel);
+ if (ret != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, ret);
+ return;
+ }
+
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_simple_shader *state = &cmd_buffer->generation.shader_state;
+ *state = (struct anv_simple_shader) {
+ .device = device,
+ .cmd_buffer = cmd_buffer,
+ .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
+ .general_state_stream = &cmd_buffer->general_state_stream,
+ .batch = &cmd_buffer->generation.batch,
+ .kernel = gen_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
+ };
+
+ genX(emit_simple_shader_init)(state);
+}
+
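+/* On Gfx11+ the draw id is passed through the extended 3DPRIMITIVE
+ * parameters, so no separate buffer is needed. On earlier gens, allocate
+ * dynamic state to hold one 32-bit draw id per draw.
+ */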
+static struct anv_address
+genX(cmd_buffer_get_draw_id_addr)(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t draw_id_count)
+{
+#if GFX_VER >= 11
+ return ANV_NULL_ADDRESS;
+#else
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ if (!vs_prog_data->uses_drawid)
+ return ANV_NULL_ADDRESS;
+
+ struct anv_state draw_id_state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4 * draw_id_count, 4);
+ return anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
+ draw_id_state);
+#endif
+}
+
+static uint32_t
+genX(cmd_buffer_get_generated_draw_stride)(struct anv_cmd_buffer *cmd_buffer)
+{
+ /* With the extended parameters in 3DPRIMITIVE on Gfx11+ we can emit
+ * everything in a single packet. Prior to Gfx11, we also need to emit a
+ * couple of VERTEX_BUFFER_STATE entries.
+ */
+#if GFX_VER >= 11
+ return 4 * GENX(3DPRIMITIVE_EXTENDED_length);
+#else
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ uint32_t len = 0;
+
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance ||
+ vs_prog_data->uses_drawid) {
+ len += 4; /* 3DSTATE_VERTEX_BUFFERS */
+
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance)
+ len += 4 * GENX(VERTEX_BUFFER_STATE_length);
+
+ if (vs_prog_data->uses_drawid)
+ len += 4 * GENX(VERTEX_BUFFER_STATE_length);
+ }
+
+ return len + 4 * GENX(3DPRIMITIVE_length);
+#endif
+}
+
+static void
+genX(cmd_buffer_rewrite_forward_end_addr)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_gen_indirect_params *params)
+{
+ /* We don't know the end_addr until we have emitted all the generation
+ * draws. Go and edit the address of all the push parameters.
+ */
+ uint64_t end_addr =
+ anv_address_physical(anv_batch_current_address(&cmd_buffer->batch));
+ while (params != NULL) {
+ params->end_addr = end_addr;
+ params = params->prev;
+ }
+}
+
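+/* Generate the draw commands directly into the main command buffer. Batch
+ * space for all max_draw_count 3DPRIMITIVEs is reserved up front and filled
+ * by the generation shader.
+ */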
+static void
+genX(cmd_buffer_emit_indirect_generated_draws_inplace)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ struct anv_address count_addr,
+ uint32_t max_draw_count,
+ bool indexed)
+{
+ const bool start_generation_batch =
+ anv_address_is_null(cmd_buffer->generation.return_addr);
+
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ struct anv_address draw_id_addr =
+ genX(cmd_buffer_get_draw_id_addr)(cmd_buffer, max_draw_count);
+
+#if GFX_VER == 9
+ /* Mark the VB-0 as using the entire dynamic state pool area, but only for
+ * the draw call starting the generation batch. All the following ones will
+ * use the same area.
+ */
+ if (start_generation_batch) {
+ struct anv_device *device = cmd_buffer->device;
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(
+ cmd_buffer, 0,
+ (struct anv_address) {
+ .offset = device->physical->va.dynamic_state_pool.addr,
+ },
+ device->physical->va.dynamic_state_pool.size);
+ }
+
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ if (vs_prog_data->uses_baseinstance ||
+ vs_prog_data->uses_firstvertex) {
+ /* We're using the indirect buffer directly to source base instance &
+ * first vertex values. Mark the entire area as used.
+ */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
+ indirect_data_addr,
+ indirect_data_stride * max_draw_count);
+ }
+
+ if (vs_prog_data->uses_drawid) {
+ /* Mark the whole draw id buffer as used. */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
+ draw_id_addr,
+ sizeof(uint32_t) * max_draw_count);
+ }
+#endif
+
+ /* Apply the pipeline flush here so the indirect data is available for the
+ * generation shader.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ if (start_generation_batch)
+ genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ /* Emit the 3D state in the main batch. */
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ const uint32_t draw_cmd_stride =
+ genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);
+
+ struct anv_gen_indirect_params *last_params = NULL;
+ uint32_t item_base = 0;
+ while (item_base < max_draw_count) {
+ const uint32_t item_count = MIN2(max_draw_count - item_base,
+ MAX_GENERATED_DRAW_COUNT);
+ const uint32_t draw_cmd_size = item_count * draw_cmd_stride;
+
+ /* Ensure we have enough contiguous space for all the draws so that the
+ * compute shader can edit all the 3DPRIMITIVEs from a single base
+ * address.
+ *
+ * TODO: we might have to split this if the amount of space required is too
+ * large (at 1MB?).
+ */
+ VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch,
+ draw_cmd_size);
+ if (result != VK_SUCCESS)
+ return;
+
+ struct anv_state params_state =
+ genX(cmd_buffer_emit_generate_draws)(
+ cmd_buffer,
+ &cmd_buffer->generation.shader_state,
+ anv_batch_current_address(&cmd_buffer->batch),
+ draw_cmd_stride,
+ indirect_data_addr,
+ indirect_data_stride,
+ anv_address_add(draw_id_addr, 4 * item_base),
+ item_base,
+ item_count,
+ count_addr,
+ max_draw_count,
+ indexed,
+ 0 /* ring_count */);
+ struct anv_gen_indirect_params *params = params_state.map;
+ if (params == NULL)
+ return;
+
+ anv_batch_advance(&cmd_buffer->batch, draw_cmd_size);
+
+ item_base += item_count;
+
+ params->prev = last_params;
+ last_params = params;
+ }
+
+ genX(cmd_buffer_rewrite_forward_end_addr)(cmd_buffer, last_params);
+
+#if GFX_VER == 9
+ update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
+#endif
+}
+
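+/* Generate the draw commands into a fixed size ring buffer BO. The main
+ * batch jumps into the ring, and when more than MAX_RING_BO_ITEMS draws are
+ * needed, the ring jumps back so another set of commands can be generated.
+ */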
+static void
+genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ struct anv_address count_addr,
+ uint32_t max_draw_count,
+ bool indexed)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ const uint32_t draw_cmd_stride =
+ genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);
+
+ if (cmd_buffer->generation.ring_bo == NULL) {
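+ /* Size the ring buffer for MAX_RING_BO_ITEMS draw commands plus the
+ * prefetch resume MI_ARB_CHECK (Gfx12+), the draw id slots (Gfx9) and
+ * the trailing MI_BATCH_BUFFER_START (see the layout description below).
+ */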
+ const uint32_t bo_size = align(
+#if GFX_VER >= 12
+ GENX(MI_ARB_CHECK_length) * 4 +
+#endif
+ draw_cmd_stride * MAX_RING_BO_ITEMS +
+#if GFX_VER == 9
+ 4 * MAX_RING_BO_ITEMS +
+#endif
+ GENX(MI_BATCH_BUFFER_START_length) * 4,
+ 4096);
+ VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, bo_size,
+ &cmd_buffer->generation.ring_bo);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+ }
+
+ /* How many items will be generated by each iteration of the generation
+ * shader dispatch.
+ */
+ const uint32_t ring_count = MIN2(MAX_RING_BO_ITEMS, max_draw_count);
+
+ /* The ring bo has the following layout:
+ *
+ * --------------------------------------------------
+ * | MI_ARB_CHECK to resume CS prefetch (Gfx12+) |
+ * |------------------------------------------------|
+ * | ring_count * 3DPRIMITIVE |
+ * |------------------------------------------------|
+ * | jump instruction (either back to generate more |
+ * | commands or to the next set of commands) |
+ * |------------------------------------------------|
+ * | draw ids (only used on Gfx9) |
+ * --------------------------------------------------
+ */
+
+ struct anv_address draw_id_addr = (struct anv_address) {
+ .bo = cmd_buffer->generation.ring_bo,
+ .offset = ring_count * draw_cmd_stride +
+ GENX(MI_BATCH_BUFFER_START_length) * 4,
+ };
+
+ struct anv_address draw_cmds_addr = (struct anv_address) {
+ .bo = cmd_buffer->generation.ring_bo,
+#if GFX_VER >= 12
+ .offset = GENX(MI_ARB_CHECK_length) * 4,
+#endif
+ };
+
+#if GFX_VER >= 12
+ struct GENX(MI_ARB_CHECK) resume_prefetch = {
+ .PreParserDisableMask = true,
+ .PreParserDisable = false,
+ };
+ GENX(MI_ARB_CHECK_pack)(NULL, cmd_buffer->generation.ring_bo->map,
+ &resume_prefetch);
+#endif
+
+#if GFX_VER == 9
+ /* Mark the VB-0 as using the entire ring_bo, but only for the draw call
+ * starting the generation batch. All the following ones will use the same
+ * area.
+ */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(
+ cmd_buffer, 0,
+ (struct anv_address) {
+ .bo = cmd_buffer->generation.ring_bo,
+ },
+ cmd_buffer->generation.ring_bo->size);
+
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ if (vs_prog_data->uses_baseinstance ||
+ vs_prog_data->uses_firstvertex) {
+ /* We're using the indirect buffer directly to source base instance &
+ * first vertex values. Mark the entire area as used.
+ */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
+ indirect_data_addr,
+ indirect_data_stride * max_draw_count);
+ }
+
+ if (vs_prog_data->uses_drawid) {
+ /* Mark the whole draw id buffer as used. */
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
+ draw_id_addr,
+ sizeof(uint32_t) * max_draw_count);
+ }
+#endif
+
+ /* Apply the pipeline flush here so the indirect data is available for the
+ * generation shader.
+ */
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ trace_intel_begin_generate_draws(&cmd_buffer->trace);
+
+ /***
+ * This is the location the commands emitted below jump back to when more
+ * draws need to be generated.
+ */
+ struct anv_address gen_addr = anv_batch_current_address(&cmd_buffer->batch);
+
+ struct anv_shader_bin *gen_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(
+ cmd_buffer->device,
+ ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
+ &gen_kernel);
+ if (ret != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, ret);
+ return;
+ }
+
+ struct anv_simple_shader simple_state = (struct anv_simple_shader) {
+ .device = device,
+ .cmd_buffer = cmd_buffer,
+ .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
+ .general_state_stream = &cmd_buffer->general_state_stream,
+ .batch = &cmd_buffer->batch,
+ .kernel = gen_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
+ };
+ genX(emit_simple_shader_init)(&simple_state);
+
+ struct anv_state params_state =
+ genX(cmd_buffer_emit_generate_draws)(
+ cmd_buffer,
+ &simple_state,
+ draw_cmds_addr,
+ draw_cmd_stride,
+ indirect_data_addr,
+ indirect_data_stride,
+ draw_id_addr,
+ 0 /* item_base */,
+ MIN2(MAX_RING_BO_ITEMS, max_draw_count) /* item_count */,
+ count_addr,
+ max_draw_count,
+ indexed,
+ ring_count);
+ struct anv_gen_indirect_params *params = params_state.map;
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+#if GFX_VER == 9
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
+#endif
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT,
+ "after generation flush");
+
+ trace_intel_end_generate_draws(&cmd_buffer->trace);
+
+ if (cmd_buffer->state.conditional_render_enabled)
+ genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+ /* Emit the 3D state in the main batch. */
+ genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
+
+ if (max_draw_count > 0) {
+#if GFX_VER >= 12
+ /* Disable the CS prefetch before jumping into the ring buffer. Prior to
+ * Gfx12 we cannot disable it, but it doesn't matter as the prefetch
+ * shouldn't follow the MI_BATCH_BUFFER_START anyway.
+ */
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
+ arb.PreParserDisableMask = true;
+ arb.PreParserDisable = true;
+ }
+#endif
+
+ /* Jump into the ring buffer. */
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress = (struct anv_address) {
+ .bo = cmd_buffer->generation.ring_bo,
+ };
+ }
+
+ /***
+ * This is the location the ring buffer jumps back to when it needs to
+ * generate more draw calls. We do the following:
+ * - wait for the draws in the ring buffer to complete (CS stall) so we're
+ * sure the push constant data we're about to edit is no longer read
+ * - increment the base draw number by the number of draws executed in
+ * the ring
+ * - invalidate the constant cache since
+ * anv_gen_indirect_params::draw_base was updated
+ * - jump back to the generation shader
+ */
+ struct anv_address inc_addr =
+ anv_batch_current_address(&cmd_buffer->batch);
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
+ ANV_PIPE_CS_STALL_BIT,
+ "after generated draws batch");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
+ struct anv_address draw_base_addr = anv_address_add(
+ genX(simple_shader_push_state_address)(
+ &simple_state, params_state),
+ offsetof(struct anv_gen_indirect_params, draw_base));
+
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device,
+ &draw_base_addr);
+ mi_builder_set_mocs(&b, mocs);
+
+ mi_store(&b, mi_mem32(draw_base_addr),
+ mi_iadd(&b, mi_mem32(draw_base_addr),
+ mi_imm(ring_count)));
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
+ "after generated draws batch increment");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
+ bbs.AddressSpaceIndicator = ASI_PPGTT;
+ bbs.BatchBufferStartAddress = gen_addr;
+ }
+
+ /***
+ * This is the location the ring buffer jumps to once all the draw calls
+ * have executed.
+ */
+ struct anv_address end_addr = anv_batch_current_address(&cmd_buffer->batch);
+
+ /* Reset the draw_base field in case we ever replay the command buffer. */
+ mi_store(&b, mi_mem32(draw_base_addr), mi_imm(0));
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
+ "after generated draws end");
+
+ params->gen_addr = anv_address_physical(inc_addr);
+ params->end_addr = anv_address_physical(end_addr);
+ }
+}
+
+static void
+genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address indirect_data_addr,
+ uint32_t indirect_data_stride,
+ struct anv_address count_addr,
+ uint32_t max_draw_count,
+ bool indexed)
+{
+ /* In order to have the vertex fetch gather the data we need a non-zero
+ * stride. The application is allowed to pass a 0 stride when draw_count
+ * is 1, but we need a correct value for
+ * VERTEX_BUFFER_STATE::BufferPitch, so the caller must have set it
+ * correctly:
+ *
+ * Vulkan spec, vkCmdDrawIndirect:
+ *
+ * "If drawCount is less than or equal to one, stride is ignored."
+ */
+ assert(indirect_data_stride > 0);
+
+ const bool use_ring_buffer = max_draw_count >=
+ cmd_buffer->device->physical->instance->generated_indirect_ring_threshold;
+ if (use_ring_buffer) {
+ genX(cmd_buffer_emit_indirect_generated_draws_inring)(cmd_buffer,
+ indirect_data_addr,
+ indirect_data_stride,
+ count_addr,
+ max_draw_count,
+ indexed);
+ } else {
+ genX(cmd_buffer_emit_indirect_generated_draws_inplace)(cmd_buffer,
+ indirect_data_addr,
+ indirect_data_stride,
+ count_addr,
+ max_draw_count,
+ indexed);
+ }
+}
+
+#endif /* GENX_CMD_DRAW_GENERATED_INDIRECT_H */
diff --git a/src/intel/vulkan/genX_cmd_draw_helpers.h b/src/intel/vulkan/genX_cmd_draw_helpers.h
new file mode 100644
index 00000000000..2c370909ef1
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_draw_helpers.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GENX_CMD_DRAW_HELPERS_H
+#define GENX_CMD_DRAW_HELPERS_H
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "anv_private.h"
+
+#if GFX_VER < 11
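+/* Bind a buffer at vertex buffer slot index with a zero pitch. This is how
+ * per-draw values (base vertex/instance, draw id) are sourced as vertex
+ * attributes on gens without extended 3DPRIMITIVE parameters.
+ */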
+static void
+emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ uint32_t size, uint32_t index)
+{
+ uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
+ GENX(3DSTATE_VERTEX_BUFFERS));
+
+ GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
+ &(struct GENX(VERTEX_BUFFER_STATE)) {
+ .VertexBufferIndex = index,
+ .AddressModifyEnable = true,
+ .BufferPitch = 0,
+ .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
+ ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
+ .NullVertexBuffer = size == 0,
+ .BufferStartingAddress = addr,
+ .BufferSize = size
+ });
+
+#if GFX_VER == 9
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
+ index, addr, size);
+#endif
+}
+
+static void
+emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr)
+{
+ emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
+}
+
+static void
+emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t base_vertex, uint32_t base_instance)
+{
+ if (base_vertex == 0 && base_instance == 0) {
+ emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
+ return;
+ }
+
+ struct anv_state id_state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 8, 4);
+
+ ((uint32_t *)id_state.map)[0] = base_vertex;
+ ((uint32_t *)id_state.map)[1] = base_instance;
+
+ struct anv_address addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, id_state);
+
+ emit_base_vertex_instance_bo(cmd_buffer, addr);
+}
+
+static void
+emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
+{
+ struct anv_state state =
+ anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 4, 4);
+
+ ((uint32_t *)state.map)[0] = draw_index;
+
+ struct anv_address addr =
+ anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
+
+ emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
+}
+#endif /* GFX_VER < 11 */
+
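+/* On Gfx9, record which vertex buffer bindings the draw used (including the
+ * internal SVGS & draw id bindings) so the VF cache flush workaround can
+ * track them.
+ */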
+static void
+update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
+ uint32_t access_type)
+{
+#if GFX_VER == 9
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+ uint64_t vb_used = dyn->vi->bindings_valid;
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance)
+ vb_used |= 1ull << ANV_SVGS_VB_INDEX;
+ if (vs_prog_data->uses_drawid)
+ vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
+
+ genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
+ access_type,
+ vb_used);
+#endif
+}
+
+#if GFX_VER < 11
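+/* Upload the base vertex/instance and draw id vertex buffers required by the
+ * current vertex shader and apply any pending pipe flushes before the draw.
+ */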
+ALWAYS_INLINE static void
+cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
+ const struct brw_vs_prog_data *vs_prog_data,
+ uint32_t base_vertex,
+ uint32_t base_instance,
+ uint32_t draw_id,
+ bool force_flush)
+{
+ bool emitted = false;
+ if (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) {
+ emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
+ emitted = true;
+ }
+ if (vs_prog_data->uses_drawid) {
+ emit_draw_index(cmd_buffer, draw_id);
+ emitted = true;
+ }
+ /* Emitting draw index or vertex index BOs may result in needing
+ * additional VF cache flushes.
+ */
+ if (emitted || force_flush)
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+}
+#endif
+
+#endif /* GENX_CMD_DRAW_HELPERS_H */
diff --git a/src/intel/vulkan/genX_cmd_video.c b/src/intel/vulkan/genX_cmd_video.c
new file mode 100644
index 00000000000..e7e94f16f25
--- /dev/null
+++ b/src/intel/vulkan/genX_cmd_video.c
@@ -0,0 +1,1195 @@
+/*
+ * Copyright © 2021 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+
+#include "util/vl_zscan_data.h"
+
+void
+genX(CmdBeginVideoCodingKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoBeginCodingInfoKHR *pBeginInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_video_session, vid, pBeginInfo->videoSession);
+ ANV_FROM_HANDLE(anv_video_session_params, params, pBeginInfo->videoSessionParameters);
+
+ cmd_buffer->video.vid = vid;
+ cmd_buffer->video.params = params;
+}
+
+void
+genX(CmdControlVideoCodingKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoCodingControlInfoKHR *pCodingControlInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ if (pCodingControlInfo->flags & VK_VIDEO_CODING_CONTROL_RESET_BIT_KHR) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.VideoPipelineCacheInvalidate = 1;
+ }
+ }
+}
+
+void
+genX(CmdEndVideoCodingKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoEndCodingInfoKHR *pEndCodingInfo)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ cmd_buffer->video.vid = NULL;
+ cmd_buffer->video.params = NULL;
+}
+
+/*
+ * The default scan order of scaling lists is up-right-diagonal
+ * according to the spec, but the device requires raster order,
+ * so convert the passed scaling lists here.
+ */
+static void
+anv_h265_matrix_from_uprightdiagonal(StdVideoH265ScalingLists *out_sl,
+ const StdVideoH265ScalingLists *sl)
+{
+ uint8_t i, j;
+
+ for (i = 0; i < 6; i++) {
+ for (j = 0; j < STD_VIDEO_H265_SCALING_LIST_4X4_NUM_ELEMENTS; j++)
+ out_sl->ScalingList4x4[i][vl_zscan_h265_up_right_diagonal_16[j]] =
+ sl->ScalingList4x4[i][j];
+
+ for (j = 0; j < STD_VIDEO_H265_SCALING_LIST_8X8_NUM_ELEMENTS; j++)
+ out_sl->ScalingList8x8[i][vl_zscan_h265_up_right_diagonal[j]] =
+ sl->ScalingList8x8[i][j];
+
+ for (j = 0; j < STD_VIDEO_H265_SCALING_LIST_16X16_NUM_ELEMENTS; j++)
+ out_sl->ScalingList16x16[i][vl_zscan_h265_up_right_diagonal[j]] =
+ sl->ScalingList16x16[i][j];
+ }
+
+ for (i = 0; i < STD_VIDEO_H265_SCALING_LIST_32X32_NUM_LISTS; i++) {
+ for (j = 0; j < STD_VIDEO_H265_SCALING_LIST_32X32_NUM_ELEMENTS; j++)
+ out_sl->ScalingList32x32[i][vl_zscan_h265_up_right_diagonal[j]] =
+ sl->ScalingList32x32[i][j];
+ }
+}
+
+static void
+scaling_list(struct anv_cmd_buffer *cmd_buffer,
+ const StdVideoH265ScalingLists *scaling_list)
+{
+ StdVideoH265ScalingLists out_sl = {0, };
+
+ anv_h265_matrix_from_uprightdiagonal(&out_sl, scaling_list);
+
+ /* 4x4, 8x8, 16x16, 32x32 */
+ for (uint8_t size = 0; size < 4; size++) {
+ /* Intra, Inter */
+ for (uint8_t pred = 0; pred < 2; pred++) {
+ /* Y, Cb, Cr */
+ for (uint8_t color = 0; color < 3; color++) {
+ if (size == 3 && color > 0)
+ continue;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_QM_STATE), qm) {
+ qm.SizeID = size;
+ qm.PredictionType = pred;
+ qm.ColorComponent = color;
+
+ qm.DCCoefficient = size > 1 ?
+ (size == 2 ? scaling_list->ScalingListDCCoef16x16[3 * pred + color] :
+ scaling_list->ScalingListDCCoef32x32[pred]) : 0;
+
+ if (size == 0) {
+ for (uint8_t i = 0; i < 4; i++)
+ for (uint8_t j = 0; j < 4; j++)
+ qm.QuantizerMatrix8x8[4 * i + j] =
+ out_sl.ScalingList4x4[3 * pred + color][4 * i + j];
+ } else if (size == 1) {
+ for (uint8_t i = 0; i < 8; i++)
+ for (uint8_t j = 0; j < 8; j++)
+ qm.QuantizerMatrix8x8[8 * i + j] =
+ out_sl.ScalingList8x8[3 * pred + color][8 * i + j];
+ } else if (size == 2) {
+ for (uint8_t i = 0; i < 8; i++)
+ for (uint8_t j = 0; j < 8; j++)
+ qm.QuantizerMatrix8x8[8 * i + j] =
+ out_sl.ScalingList16x16[3 * pred + color][8 * i + j];
+ } else if (size == 3) {
+ for (uint8_t i = 0; i < 8; i++)
+ for (uint8_t j = 0; j < 8; j++)
+ qm.QuantizerMatrix8x8[8 * i + j] =
+ out_sl.ScalingList32x32[pred][8 * i + j];
+ }
+ }
+ }
+ }
+ }
+}
+
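+/* Emit the HCP commands to decode a single H.265 frame: pipe mode & surface
+ * setup, buffer addresses, scaling lists, picture state, tile state and
+ * per-slice state.
+ */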
+static void
+anv_h265_decode_video(struct anv_cmd_buffer *cmd_buffer,
+ const VkVideoDecodeInfoKHR *frame_info)
+{
+ ANV_FROM_HANDLE(anv_buffer, src_buffer, frame_info->srcBuffer);
+ struct anv_video_session *vid = cmd_buffer->video.vid;
+ struct anv_video_session_params *params = cmd_buffer->video.params;
+
+ const struct VkVideoDecodeH265PictureInfoKHR *h265_pic_info =
+ vk_find_struct_const(frame_info->pNext, VIDEO_DECODE_H265_PICTURE_INFO_KHR);
+
+ const StdVideoH265SequenceParameterSet *sps =
+ vk_video_find_h265_dec_std_sps(&params->vk, h265_pic_info->pStdPictureInfo->pps_seq_parameter_set_id);
+ const StdVideoH265PictureParameterSet *pps =
+ vk_video_find_h265_dec_std_pps(&params->vk, h265_pic_info->pStdPictureInfo->pps_pic_parameter_set_id);
+
+ struct vk_video_h265_reference ref_slots[2][8] = { 0 };
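+ /* dpb_idx maps a reference slot index to its position in the reference
+ * picture list programmed in HCP_PIPE_BUF_ADDR_STATE below.
+ */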
+ uint8_t dpb_idx[ANV_VIDEO_H265_MAX_NUM_REF_FRAME] = { 0,};
+ bool is_10bit = sps->bit_depth_chroma_minus8 || sps->bit_depth_luma_minus8;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.VideoPipelineCacheInvalidate = 1;
+ };
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FORCE_WAKEUP), wake) {
+ wake.HEVCPowerWellControl = 1;
+ wake.MaskBits = 768;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(VD_CONTROL_STATE), cs) {
+ cs.PipelineInitialization = true;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) {
+ mfx.MFXSyncControlFlag = 1;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_PIPE_MODE_SELECT), sel) {
+ sel.CodecSelect = Decode;
+ sel.CodecStandardSelect = HEVC;
+ }
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) {
+ mfx.MFXSyncControlFlag = 1;
+ }
+#endif
+
+ const struct anv_image_view *iv =
+ anv_image_view_from_handle(frame_info->dstPictureResource.imageViewBinding);
+ const struct anv_image *img = iv->image;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_SURFACE_STATE), ss) {
+ ss.SurfacePitch = img->planes[0].primary_surface.isl.row_pitch_B - 1;
+ ss.SurfaceID = HCP_CurrentDecodedPicture;
+ ss.SurfaceFormat = is_10bit ? P010 : PLANAR_420_8;
+
+ ss.YOffsetforUCb = img->planes[1].primary_surface.memory_range.offset /
+ img->planes[0].primary_surface.isl.row_pitch_B;
+
+#if GFX_VER >= 11
+ ss.DefaultAlphaValue = 0xffff;
+#endif
+ }
+
+#if GFX_VER >= 12
+ /* On Gfx12, the reference surface seemingly needs the same state as the decode surface. */
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_SURFACE_STATE), ss) {
+ ss.SurfacePitch = img->planes[0].primary_surface.isl.row_pitch_B - 1;
+ ss.SurfaceID = HCP_ReferencePicture;
+ ss.SurfaceFormat = is_10bit ? P010 : PLANAR_420_8;
+
+ ss.YOffsetforUCb = img->planes[1].primary_surface.memory_range.offset /
+ img->planes[0].primary_surface.isl.row_pitch_B;
+
+ ss.DefaultAlphaValue = 0xffff;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_PIPE_BUF_ADDR_STATE), buf) {
+ buf.DecodedPictureAddress =
+ anv_image_address(img, &img->planes[0].primary_surface.memory_range);
+
+ buf.DecodedPictureMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DecodedPictureAddress.bo, 0),
+ };
+
+ buf.DeblockingFilterLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_LINE].offset
+ };
+
+ buf.DeblockingFilterLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterLineBufferAddress.bo, 0),
+ };
+
+ buf.DeblockingFilterTileLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_LINE].offset
+ };
+
+ buf.DeblockingFilterTileLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterTileLineBufferAddress.bo, 0),
+ };
+
+ buf.DeblockingFilterTileColumnBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_COLUMN].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_DEBLOCK_FILTER_ROW_STORE_TILE_COLUMN].offset
+ };
+
+ buf.DeblockingFilterTileColumnBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterTileColumnBufferAddress.bo, 0),
+ };
+
+ buf.MetadataLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_LINE].offset
+ };
+
+ buf.MetadataLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.MetadataLineBufferAddress.bo, 0),
+ };
+
+ buf.MetadataTileLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_TILE_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_TILE_LINE].offset
+ };
+
+ buf.MetadataTileLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.MetadataTileLineBufferAddress.bo, 0),
+ };
+
+ buf.MetadataTileColumnBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_TILE_COLUMN].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_METADATA_TILE_COLUMN].offset
+ };
+
+ buf.MetadataTileColumnBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.MetadataTileColumnBufferAddress.bo, 0),
+ };
+
+ buf.SAOLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_LINE].offset
+ };
+
+ buf.SAOLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.SAOLineBufferAddress.bo, 0),
+ };
+
+ buf.SAOTileLineBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_TILE_LINE].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_TILE_LINE].offset
+ };
+
+ buf.SAOTileLineBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.SAOTileLineBufferAddress.bo, 0),
+ };
+
+ buf.SAOTileColumnBufferAddress = (struct anv_address) {
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_TILE_COLUMN].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H265_SAO_TILE_COLUMN].offset
+ };
+
+ buf.SAOTileColumnBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.SAOTileColumnBufferAddress.bo, 0),
+ };
+
+ buf.CurrentMVTemporalBufferAddress = anv_image_address(img, &img->vid_dmv_top_surface);
+
+ buf.CurrentMVTemporalBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.CurrentMVTemporalBufferAddress.bo, 0),
+ };
+
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ const struct anv_image_view *ref_iv =
+ anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding);
+ int slot_idx = frame_info->pReferenceSlots[i].slotIndex;
+
+ assert(slot_idx < ANV_VIDEO_H265_MAX_NUM_REF_FRAME);
+ dpb_idx[slot_idx] = i;
+
+ buf.ReferencePictureAddress[i] =
+ anv_image_address(ref_iv->image, &ref_iv->image->planes[0].primary_surface.memory_range);
+ }
+
+ buf.ReferencePictureMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.OriginalUncompressedPictureSourceMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.StreamOutDataDestinationMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.DecodedPictureStatusBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.LCUILDBStreamOutBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ const struct anv_image_view *ref_iv =
+ anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding);
+
+ buf.CollocatedMVTemporalBufferAddress[i] =
+ anv_image_address(ref_iv->image, &ref_iv->image->vid_dmv_top_surface);
+ }
+
+ buf.CollocatedMVTemporalBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.CollocatedMVTemporalBufferAddress[0].bo, 0),
+ };
+
+ buf.VP9ProbabilityBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.VP9SegmentIDBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.VP9HVDLineRowStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ buf.VP9HVDTileRowStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+#if GFX_VER >= 11
+ buf.SAOStreamOutDataDestinationBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.FrameStatisticsStreamOutDataDestinationBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.SSESourcePixelRowStoreBufferMemoryAddressAttributesReadWrite = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.HCPScalabilitySliceStateBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.HCPScalabilityCABACDecodedSyntaxElementsBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.MVUpperRightColumnStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.IntraPredictionUpperRightColumnStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.IntraPredictionLeftReconColumnStoreBufferMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+#endif
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_IND_OBJ_BASE_ADDR_STATE), indirect) {
+ indirect.HCPIndirectBitstreamObjectBaseAddress =
+ anv_address_add(src_buffer->address, frame_info->srcBufferOffset & ~4095);
+
+ indirect.HCPIndirectBitstreamObjectMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, src_buffer->address.bo, 0),
+ };
+
+ indirect.HCPIndirectBitstreamObjectAccessUpperBound =
+ anv_address_add(src_buffer->address, align64(frame_info->srcBufferRange, 4096));
+
+ indirect.HCPIndirectCUObjectMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ indirect.HCPPAKBSEObjectMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+#if GFX_VER >= 11
+ indirect.HCPVP9PAKCompressedHeaderSyntaxStreamInMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ indirect.HCPVP9PAKProbabilityCounterStreamOutMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ indirect.HCPVP9PAKProbabilityDeltasStreamInMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ indirect.HCPVP9PAKTileRecordStreamOutMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ indirect.HCPVP9PAKCULevelStatisticStreamOutMemoryAddressAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+#endif
+ }
+
+ if (sps->flags.scaling_list_enabled_flag) {
+ if (pps->flags.pps_scaling_list_data_present_flag) {
+ scaling_list(cmd_buffer, pps->pScalingLists);
+ } else if (sps->flags.sps_scaling_list_data_present_flag) {
+ scaling_list(cmd_buffer, sps->pScalingLists);
+ }
+ } else {
+ for (uint8_t size = 0; size < 4; size++) {
+ for (uint8_t pred = 0; pred < 2; pred++) {
+ for (uint8_t color = 0; color < 3; color++) {
+
+ if (size == 3 && color > 0)
+ continue;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_QM_STATE), qm) {
+ qm.SizeID = size;
+ qm.PredictionType = pred;
+ qm.ColorComponent = color;
+ qm.DCCoefficient = (size > 1) ? 16 : 0;
+ unsigned len = (size == 0) ? 16 : 64;
+
+ for (uint8_t q = 0; q < len; q++)
+ qm.QuantizerMatrix8x8[q] = 0x10;
+ }
+ }
+ }
+ }
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_PIC_STATE), pic) {
+ pic.FrameWidthInMinimumCodingBlockSize =
+ sps->pic_width_in_luma_samples / (1 << (sps->log2_min_luma_coding_block_size_minus3 + 3)) - 1;
+ pic.FrameHeightInMinimumCodingBlockSize =
+ sps->pic_height_in_luma_samples / (1 << (sps->log2_min_luma_coding_block_size_minus3 + 3)) - 1;
+
+ pic.MinCUSize = sps->log2_min_luma_coding_block_size_minus3 & 0x3;
+ pic.LCUSize = (sps->log2_diff_max_min_luma_coding_block_size +
+ sps->log2_min_luma_coding_block_size_minus3) & 0x3;
+
+ pic.MinTUSize = sps->log2_min_luma_transform_block_size_minus2 & 0x3;
+ pic.MaxTUSize = (sps->log2_diff_max_min_luma_transform_block_size + sps->log2_min_luma_transform_block_size_minus2) & 0x3;
+ pic.MinPCMSize = sps->log2_min_pcm_luma_coding_block_size_minus3 & 0x3;
+ pic.MaxPCMSize = (sps->log2_diff_max_min_pcm_luma_coding_block_size + sps->log2_min_pcm_luma_coding_block_size_minus3) & 0x3;
+
+#if GFX_VER >= 11
+ pic.Log2SAOOffsetScaleLuma = pps->log2_sao_offset_scale_luma;
+ pic.Log2SAOOffsetScaleChroma = pps->log2_sao_offset_scale_chroma;
+ pic.ChromaQPOffsetListLength = pps->chroma_qp_offset_list_len_minus1;
+ pic.DiffCUChromaQPOffsetDepth = pps->diff_cu_chroma_qp_offset_depth;
+ pic.ChromaQPOffsetListEnable = pps->flags.chroma_qp_offset_list_enabled_flag;
+ pic.ChromaSubsampling = sps->chroma_format_idc;
+
+ pic.HighPrecisionOffsetsEnable = sps->flags.high_precision_offsets_enabled_flag;
+ pic.Log2MaxTransformSkipSize = pps->log2_max_transform_skip_block_size_minus2 + 2;
+ pic.CrossComponentPredictionEnable = pps->flags.cross_component_prediction_enabled_flag;
+ pic.CABACBypassAlignmentEnable = sps->flags.cabac_bypass_alignment_enabled_flag;
+ pic.PersistentRiceAdaptationEnable = sps->flags.persistent_rice_adaptation_enabled_flag;
+ pic.IntraSmoothingDisable = sps->flags.intra_smoothing_disabled_flag;
+ pic.ExplicitRDPCMEnable = sps->flags.explicit_rdpcm_enabled_flag;
+ pic.ImplicitRDPCMEnable = sps->flags.implicit_rdpcm_enabled_flag;
+ pic.TransformSkipContextEnable = sps->flags.transform_skip_context_enabled_flag;
+ pic.TransformSkipRotationEnable = sps->flags.transform_skip_rotation_enabled_flag;
+ pic.SPSRangeExtensionEnable = sps->flags.sps_range_extension_flag;
+#endif
+
+ pic.CollocatedPictureIsISlice = false;
+ pic.CurrentPictureIsISlice = false;
+ pic.SampleAdaptiveOffsetEnable = sps->flags.sample_adaptive_offset_enabled_flag;
+ pic.PCMEnable = sps->flags.pcm_enabled_flag;
+ pic.CUQPDeltaEnable = pps->flags.cu_qp_delta_enabled_flag;
+ pic.MaxDQPDepth = pps->diff_cu_qp_delta_depth;
+ pic.PCMLoopFilterDisable = sps->flags.pcm_loop_filter_disabled_flag;
+ pic.ConstrainedIntraPrediction = pps->flags.constrained_intra_pred_flag;
+ pic.Log2ParallelMergeLevel = pps->log2_parallel_merge_level_minus2;
+ pic.SignDataHiding = pps->flags.sign_data_hiding_enabled_flag;
+ pic.LoopFilterEnable = pps->flags.loop_filter_across_tiles_enabled_flag;
+ pic.EntropyCodingSyncEnable = pps->flags.entropy_coding_sync_enabled_flag;
+ pic.TilingEnable = pps->flags.tiles_enabled_flag;
+ pic.WeightedBiPredicationEnable = pps->flags.weighted_bipred_flag;
+ pic.WeightedPredicationEnable = pps->flags.weighted_pred_flag;
+ pic.FieldPic = 0;
+ pic.TopField = true;
+ pic.TransformSkipEnable = pps->flags.transform_skip_enabled_flag;
+ pic.AMPEnable = sps->flags.amp_enabled_flag;
+ pic.TransquantBypassEnable = pps->flags.transquant_bypass_enabled_flag;
+ pic.StrongIntraSmoothingEnable = sps->flags.strong_intra_smoothing_enabled_flag;
+ pic.CUPacketStructure = 0;
+
+ pic.PictureCbQPOffset = pps->pps_cb_qp_offset;
+ pic.PictureCrQPOffset = pps->pps_cr_qp_offset;
+ pic.IntraMaxTransformHierarchyDepth = sps->max_transform_hierarchy_depth_intra;
+ pic.InterMaxTransformHierarchyDepth = sps->max_transform_hierarchy_depth_inter;
+ pic.ChromaPCMSampleBitDepth = sps->pcm_sample_bit_depth_chroma_minus1 & 0xf;
+ pic.LumaPCMSampleBitDepth = sps->pcm_sample_bit_depth_luma_minus1 & 0xf;
+
+ pic.ChromaBitDepth = sps->bit_depth_chroma_minus8;
+ pic.LumaBitDepth = sps->bit_depth_luma_minus8;
+
+#if GFX_VER >= 11
+ pic.CbQPOffsetList0 = pps->cb_qp_offset_list[0];
+ pic.CbQPOffsetList1 = pps->cb_qp_offset_list[1];
+ pic.CbQPOffsetList2 = pps->cb_qp_offset_list[2];
+ pic.CbQPOffsetList3 = pps->cb_qp_offset_list[3];
+ pic.CbQPOffsetList4 = pps->cb_qp_offset_list[4];
+ pic.CbQPOffsetList5 = pps->cb_qp_offset_list[5];
+
+ pic.CrQPOffsetList0 = pps->cr_qp_offset_list[0];
+ pic.CrQPOffsetList1 = pps->cr_qp_offset_list[1];
+ pic.CrQPOffsetList2 = pps->cr_qp_offset_list[2];
+ pic.CrQPOffsetList3 = pps->cr_qp_offset_list[3];
+ pic.CrQPOffsetList4 = pps->cr_qp_offset_list[4];
+ pic.CrQPOffsetList5 = pps->cr_qp_offset_list[5];
+#endif
+ }
+
+ if (pps->flags.tiles_enabled_flag) {
+ int cum = 0;
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_TILE_STATE), tile) {
+ tile.NumberofTileColumns = pps->num_tile_columns_minus1;
+ tile.NumberofTileRows = pps->num_tile_rows_minus1;
+ for (unsigned i = 0; i < 5; i++) {
+ tile.ColumnPosition[i].CtbPos0i = cum;
+ if ((4 * i) == pps->num_tile_columns_minus1)
+ break;
+
+ cum += pps->column_width_minus1[4 * i] + 1;
+ tile.ColumnPosition[i].CtbPos1i = cum;
+
+ if ((4 * i + 1) == pps->num_tile_columns_minus1)
+ break;
+ cum += pps->column_width_minus1[4 * i + 1] + 1;
+ tile.ColumnPosition[i].CtbPos2i = cum;
+
+ if ((4 * i + 2) == pps->num_tile_columns_minus1)
+ break;
+ cum += pps->column_width_minus1[4 * i + 2] + 1;
+ tile.ColumnPosition[i].CtbPos3i = cum;
+
+ if ((4 * i + 3) >= MIN2(pps->num_tile_columns_minus1,
+ ARRAY_SIZE(pps->column_width_minus1)))
+ break;
+
+ cum += pps->column_width_minus1[4 * i + 3] + 1;
+ }
+
+ cum = 0;
+
+ for (unsigned i = 0; i < 5; i++) {
+ tile.Rowposition[i].CtbPos0i = cum;
+ if ((4 * i) == pps->num_tile_rows_minus1)
+ break;
+
+ cum += pps->row_height_minus1[4 * i] + 1;
+ tile.Rowposition[i].CtbPos1i = cum;
+
+ if ((4 * i + 1) == pps->num_tile_rows_minus1)
+ break;
+ cum += pps->row_height_minus1[4 * i + 1] + 1;
+ tile.Rowposition[i].CtbPos2i = cum;
+
+ if ((4 * i + 2) == pps->num_tile_rows_minus1)
+ break;
+ cum += pps->row_height_minus1[4 * i + 2] + 1;
+ tile.Rowposition[i].CtbPos3i = cum;
+
+ if ((4 * i + 3) == pps->num_tile_rows_minus1)
+ break;
+
+ cum += pps->row_height_minus1[4 * i + 3] + 1;
+ }
+
+ if (pps->num_tile_rows_minus1 == 20) {
+ tile.Rowposition[5].CtbPos0i = cum;
+ cum += pps->row_height_minus1[20] + 1;
+ tile.Rowposition[5].CtbPos1i = cum;
+ }
+ }
+ }
+
+ /* Slice parsing */
+ uint32_t last_slice = h265_pic_info->sliceSegmentCount - 1;
+ void *slice_map;
+ VkResult result =
+ anv_device_map_bo(cmd_buffer->device,
+ src_buffer->address.bo,
+ src_buffer->address.offset,
+ frame_info->srcBufferRange + frame_info->srcBufferOffset,
+ NULL /* placed_addr */,
+ &slice_map);
+ if (result != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, result);
+ return;
+ }
+
+ slice_map += frame_info->srcBufferOffset;
+
+ struct vk_video_h265_slice_params slice_params[h265_pic_info->sliceSegmentCount];
+
+ /* All slices are parsed in advance to collect the necessary information. */
+ for (unsigned s = 0; s < h265_pic_info->sliceSegmentCount; s++) {
+ uint32_t current_offset = h265_pic_info->pSliceSegmentOffsets[s];
+ void *map = slice_map + current_offset;
+ uint32_t slice_size = 0;
+
+ if (s == last_slice)
+ slice_size = frame_info->srcBufferRange - current_offset;
+ else
+ slice_size = h265_pic_info->pSliceSegmentOffsets[s + 1] - current_offset;
+
+ vk_video_parse_h265_slice_header(frame_info, h265_pic_info, sps, pps, map, slice_size, &slice_params[s]);
+ vk_fill_video_h265_reference_info(frame_info, h265_pic_info, &slice_params[s], ref_slots);
+ }
+
+ anv_device_unmap_bo(cmd_buffer->device, src_buffer->address.bo,
+ slice_map, frame_info->srcBufferRange,
+ false /* replace */);
+
+ for (unsigned s = 0; s < h265_pic_info->sliceSegmentCount; s++) {
+ uint32_t ctb_size = 1 << (sps->log2_diff_max_min_luma_coding_block_size +
+ sps->log2_min_luma_coding_block_size_minus3 + 3);
+ uint32_t pic_width_in_min_cbs_y = sps->pic_width_in_luma_samples /
+ (1 << (sps->log2_min_luma_coding_block_size_minus3 + 3));
+ uint32_t width_in_pix = (1 << (sps->log2_min_luma_coding_block_size_minus3 + 3)) *
+ pic_width_in_min_cbs_y;
+ uint32_t ctb_w = DIV_ROUND_UP(width_in_pix, ctb_size);
+ bool is_last = (s == last_slice);
+ int slice_qp = (slice_params[s].slice_qp_delta + pps->init_qp_minus26 + 26) & 0x3f;
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_SLICE_STATE), slice) {
+ slice.SliceHorizontalPosition = slice_params[s].slice_segment_address % ctb_w;
+ slice.SliceVerticalPosition = slice_params[s].slice_segment_address / ctb_w;
+
+ if (is_last) {
+ slice.NextSliceHorizontalPosition = 0;
+ slice.NextSliceVerticalPosition = 0;
+ } else {
+ slice.NextSliceHorizontalPosition = (slice_params[s + 1].slice_segment_address) % ctb_w;
+ slice.NextSliceVerticalPosition = (slice_params[s + 1].slice_segment_address) / ctb_w;
+ }
+
+ slice.SliceType = slice_params[s].slice_type;
+ slice.LastSlice = is_last;
+ slice.DependentSlice = slice_params[s].dependent_slice_segment;
+ slice.SliceTemporalMVPEnable = slice_params[s].temporal_mvp_enable;
+ slice.SliceQP = abs(slice_qp);
+ slice.SliceQPSign = slice_qp >= 0 ? 0 : 1;
+ slice.SliceCbQPOffset = slice_params[s].slice_cb_qp_offset;
+ slice.SliceCrQPOffset = slice_params[s].slice_cr_qp_offset;
+ slice.SliceHeaderDisableDeblockingFilter = pps->flags.deblocking_filter_override_enabled_flag ?
+ slice_params[s].disable_deblocking_filter_idc : pps->flags.pps_deblocking_filter_disabled_flag;
+ slice.SliceTCOffsetDiv2 = slice_params[s].tc_offset_div2;
+ slice.SliceBetaOffsetDiv2 = slice_params[s].beta_offset_div2;
+ slice.SliceLoopFilterEnable = slice_params[s].loop_filter_across_slices_enable;
+ slice.SliceSAOChroma = slice_params[s].sao_chroma_flag;
+ slice.SliceSAOLuma = slice_params[s].sao_luma_flag;
+ slice.MVDL1Zero = slice_params[s].mvd_l1_zero_flag;
+
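+ /* A slice is low-delay only when none of its reference pictures, in either
+  * list, has a POC greater than the current picture's.
+  */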
+ uint8_t low_delay = true;
+
+ if (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_I) {
+ low_delay = false;
+ } else {
+ for (unsigned i = 0; i < slice_params[s].num_ref_idx_l0_active; i++) {
+ int slot_idx = ref_slots[0][i].slot_index;
+
+ if (vk_video_h265_poc_by_slot(frame_info, slot_idx) >
+ h265_pic_info->pStdPictureInfo->PicOrderCntVal) {
+ low_delay = false;
+ break;
+ }
+ }
+
+ for (unsigned i = 0; i < slice_params[s].num_ref_idx_l1_active; i++) {
+ int slot_idx = ref_slots[1][i].slot_index;
+ if (vk_video_h265_poc_by_slot(frame_info, slot_idx) >
+ h265_pic_info->pStdPictureInfo->PicOrderCntVal) {
+ low_delay = false;
+ break;
+ }
+ }
+ }
+
+ slice.LowDelay = low_delay;
+ slice.CollocatedFromL0 = slice_params[s].collocated_list == 0 ? true : false;
+ slice.Log2WeightDenominatorChroma = slice_params[s].luma_log2_weight_denom +
+ (slice_params[s].chroma_log2_weight_denom - slice_params[s].luma_log2_weight_denom);
+ slice.Log2WeightDenominatorLuma = slice_params[s].luma_log2_weight_denom;
+ slice.CABACInit = slice_params[s].cabac_init_idc;
+ slice.MaxMergeIndex = slice_params[s].max_num_merge_cand - 1;
+ slice.CollocatedMVTemporalBufferIndex =
+ dpb_idx[ref_slots[slice_params[s].collocated_list][slice_params[s].collocated_ref_idx].slot_index];
+ assert(slice.CollocatedMVTemporalBufferIndex < ANV_VIDEO_H265_HCP_NUM_REF_FRAME);
+
+ slice.SliceHeaderLength = slice_params[s].slice_data_bytes_offset;
+ slice.CABACZeroWordInsertionEnable = false;
+ slice.EmulationByteSliceInsertEnable = false;
+ slice.TailInsertionPresent = false;
+ slice.SliceDataInsertionPresent = false;
+ slice.HeaderInsertionPresent = false;
+
+ slice.IndirectPAKBSEDataStartOffset = 0;
+ slice.TransformSkipLambda = 0;
+ slice.TransformSkipNumberofNonZeroCoeffsFactor0 = 0;
+ slice.TransformSkipNumberofZeroCoeffsFactor0 = 0;
+ slice.TransformSkipNumberofNonZeroCoeffsFactor1 = 0;
+ slice.TransformSkipNumberofZeroCoeffsFactor1 = 0;
+
+#if GFX_VER >= 12
+ slice.OriginalSliceStartCtbX = slice_params[s].slice_segment_address % ctb_w;
+ slice.OriginalSliceStartCtbY = slice_params[s].slice_segment_address / ctb_w;
+#endif
+ }
+
+ if (slice_params[s].slice_type != STD_VIDEO_H265_SLICE_TYPE_I) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_REF_IDX_STATE), ref) {
+ ref.ReferencePictureListSelect = 0;
+ ref.NumberofReferenceIndexesActive = slice_params[s].num_ref_idx_l0_active - 1;
+
+ for (unsigned i = 0; i < ref.NumberofReferenceIndexesActive + 1; i++) {
+ int slot_idx = ref_slots[0][i].slot_index;
+ unsigned poc = ref_slots[0][i].pic_order_cnt;
+ int32_t diff_poc = h265_pic_info->pStdPictureInfo->PicOrderCntVal - poc;
+
+ assert(dpb_idx[slot_idx] < ANV_VIDEO_H265_HCP_NUM_REF_FRAME);
+
+ ref.ReferenceListEntry[i].ListEntry = dpb_idx[slot_idx];
+ ref.ReferenceListEntry[i].ReferencePicturetbValue = CLAMP(diff_poc, -128, 127) & 0xff;
+ ref.ReferenceListEntry[i].TopField = true;
+ }
+ }
+ }
+
+ if (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_B) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_REF_IDX_STATE), ref) {
+ ref.ReferencePictureListSelect = 1;
+ ref.NumberofReferenceIndexesActive = slice_params[s].num_ref_idx_l1_active - 1;
+
+ for (unsigned i = 0; i < ref.NumberofReferenceIndexesActive + 1; i++) {
+ int slot_idx = ref_slots[1][i].slot_index;
+ unsigned poc = ref_slots[1][i].pic_order_cnt;
+ int32_t diff_poc = h265_pic_info->pStdPictureInfo->PicOrderCntVal - poc;
+
+ assert(dpb_idx[slot_idx] < ANV_VIDEO_H265_HCP_NUM_REF_FRAME);
+
+ ref.ReferenceListEntry[i].ListEntry = dpb_idx[slot_idx];
+ ref.ReferenceListEntry[i].ReferencePicturetbValue = CLAMP(diff_poc, -128, 127) & 0xff;
+ ref.ReferenceListEntry[i].TopField = true;
+ }
+ }
+ }
+
+ if ((pps->flags.weighted_pred_flag && (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_P)) ||
+ (pps->flags.weighted_bipred_flag && (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_B))) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_WEIGHTOFFSET_STATE), w) {
+ w.ReferencePictureListSelect = 0;
+
+ for (unsigned i = 0; i < ANV_VIDEO_H265_MAX_NUM_REF_FRAME; i++) {
+ w.LumaOffsets[i].DeltaLumaWeightLX = slice_params[s].delta_luma_weight_l0[i] & 0xff;
+ w.LumaOffsets[i].LumaOffsetLX = slice_params[s].luma_offset_l0[i] & 0xff;
+ w.ChromaOffsets[i].DeltaChromaWeightLX0 = slice_params[s].delta_chroma_weight_l0[i][0] & 0xff;
+ w.ChromaOffsets[i].ChromaOffsetLX0 = slice_params[s].chroma_offset_l0[i][0] & 0xff;
+ w.ChromaOffsets[i].DeltaChromaWeightLX1 = slice_params[s].delta_chroma_weight_l0[i][1] & 0xff;
+ w.ChromaOffsets[i].ChromaOffsetLX1 = slice_params[s].chroma_offset_l0[i][1] & 0xff;
+ }
+ }
+
+ if (slice_params[s].slice_type == STD_VIDEO_H265_SLICE_TYPE_B) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_WEIGHTOFFSET_STATE), w) {
+ w.ReferencePictureListSelect = 1;
+
+ for (unsigned i = 0; i < ANV_VIDEO_H265_MAX_NUM_REF_FRAME; i++) {
+ w.LumaOffsets[i].DeltaLumaWeightLX = slice_params[s].delta_luma_weight_l1[i] & 0xff;
+ w.LumaOffsets[i].LumaOffsetLX = slice_params[s].luma_offset_l1[i] & 0xff;
+ w.ChromaOffsets[i].DeltaChromaWeightLX0 = slice_params[s].delta_chroma_weight_l1[i][0] & 0xff;
+ w.ChromaOffsets[i].DeltaChromaWeightLX1 = slice_params[s].delta_chroma_weight_l1[i][1] & 0xff;
+ w.ChromaOffsets[i].ChromaOffsetLX0 = slice_params[s].chroma_offset_l1[i][0] & 0xff;
+ w.ChromaOffsets[i].ChromaOffsetLX1 = slice_params[s].chroma_offset_l1[i][1] & 0xff;
+ }
+ }
+ }
+ }
+
+ uint32_t buffer_offset = frame_info->srcBufferOffset & 4095;
+
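+ /* As in the H.264 path, the indirect bitstream base is 4 KiB aligned, so
+  * the low offset bits are added back per slice, and decoding starts after
+  * the 3-byte header.
+  */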
+ anv_batch_emit(&cmd_buffer->batch, GENX(HCP_BSD_OBJECT), bsd) {
+ bsd.IndirectBSDDataLength = slice_params[s].slice_size - 3;
+ bsd.IndirectBSDDataStartAddress = buffer_offset + h265_pic_info->pSliceSegmentOffsets[s] + 3;
+ }
+ }
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(VD_CONTROL_STATE), cs) {
+ cs.MemoryImplicitFlush = true;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(VD_PIPELINE_FLUSH), flush) {
+ flush.HEVCPipelineDone = true;
+ flush.HEVCPipelineCommandFlush = true;
+ flush.VDCommandMessageParserDone = true;
+ }
+}
+
+static void
+anv_h264_decode_video(struct anv_cmd_buffer *cmd_buffer,
+ const VkVideoDecodeInfoKHR *frame_info)
+{
+ ANV_FROM_HANDLE(anv_buffer, src_buffer, frame_info->srcBuffer);
+ struct anv_video_session *vid = cmd_buffer->video.vid;
+ struct anv_video_session_params *params = cmd_buffer->video.params;
+ const struct VkVideoDecodeH264PictureInfoKHR *h264_pic_info =
+ vk_find_struct_const(frame_info->pNext, VIDEO_DECODE_H264_PICTURE_INFO_KHR);
+ const StdVideoH264SequenceParameterSet *sps = vk_video_find_h264_dec_std_sps(&params->vk, h264_pic_info->pStdPictureInfo->seq_parameter_set_id);
+ const StdVideoH264PictureParameterSet *pps = vk_video_find_h264_dec_std_pps(&params->vk, h264_pic_info->pStdPictureInfo->pic_parameter_set_id);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.DWordLength = 2;
+ flush.VideoPipelineCacheInvalidate = 1;
+ };
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FORCE_WAKEUP), wake) {
+ wake.MFXPowerWellControl = 1;
+ wake.MaskBits = 768;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) {
+ mfx.MFXSyncControlFlag = 1;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_PIPE_MODE_SELECT), sel) {
+ sel.StandardSelect = SS_AVC;
+ sel.CodecSelect = Decode;
+ sel.DecoderShortFormatMode = ShortFormatDriverInterface;
+ sel.DecoderModeSelect = VLDMode; // Hardcoded
+
+ sel.PreDeblockingOutputEnable = 0;
+ sel.PostDeblockingOutputEnable = 1;
+ }
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_WAIT), mfx) {
+ mfx.MFXSyncControlFlag = 1;
+ }
+#endif
+
+ const struct anv_image_view *iv = anv_image_view_from_handle(frame_info->dstPictureResource.imageViewBinding);
+ const struct anv_image *img = iv->image;
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_SURFACE_STATE), ss) {
+ ss.Width = img->vk.extent.width - 1;
+ ss.Height = img->vk.extent.height - 1;
+ ss.SurfaceFormat = PLANAR_420_8; // assert on this?
+ ss.InterleaveChroma = 1;
+ ss.SurfacePitch = img->planes[0].primary_surface.isl.row_pitch_B - 1;
+ ss.TiledSurface = img->planes[0].primary_surface.isl.tiling != ISL_TILING_LINEAR;
+ ss.TileWalk = TW_YMAJOR;
+
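+ /* Offset of the interleaved UV plane, expressed in rows from the start of
+  * the Y plane.
+  */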
+ ss.YOffsetforUCb = ss.YOffsetforVCr =
+ img->planes[1].primary_surface.memory_range.offset / img->planes[0].primary_surface.isl.row_pitch_B;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_PIPE_BUF_ADDR_STATE), buf) {
+ bool use_pre_deblock = false;
+ if (use_pre_deblock) {
+ buf.PreDeblockingDestinationAddress = anv_image_address(img,
+ &img->planes[0].primary_surface.memory_range);
+ } else {
+ buf.PostDeblockingDestinationAddress = anv_image_address(img,
+ &img->planes[0].primary_surface.memory_range);
+ }
+ buf.PreDeblockingDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.PreDeblockingDestinationAddress.bo, 0),
+ };
+ buf.PostDeblockingDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.PostDeblockingDestinationAddress.bo, 0),
+ };
+
+ buf.IntraRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_INTRA_ROW_STORE].mem->bo, vid->vid_mem[ANV_VID_MEM_H264_INTRA_ROW_STORE].offset };
+ buf.IntraRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.IntraRowStoreScratchBufferAddress.bo, 0),
+ };
+ buf.DeblockingFilterRowStoreScratchAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE].mem->bo, vid->vid_mem[ANV_VID_MEM_H264_DEBLOCK_FILTER_ROW_STORE].offset };
+ buf.DeblockingFilterRowStoreScratchAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, buf.DeblockingFilterRowStoreScratchAddress.bo, 0),
+ };
+ buf.MBStatusBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.MBILDBStreamOutBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.SecondMBILDBStreamOutBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.ScaledReferenceSurfaceAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.OriginalUncompressedPictureSourceAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ buf.StreamOutDataDestinationAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+
+ struct anv_bo *ref_bo = NULL;
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ const struct anv_image_view *ref_iv = anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding);
+ int idx = frame_info->pReferenceSlots[i].slotIndex;
+ buf.ReferencePictureAddress[idx] = anv_image_address(ref_iv->image,
+ &ref_iv->image->planes[0].primary_surface.memory_range);
+
+ if (i == 0) {
+ ref_bo = ref_iv->image->bindings[0].address.bo;
+ }
+ }
+ buf.ReferencePictureAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, ref_bo, 0),
+ };
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_IND_OBJ_BASE_ADDR_STATE), index_obj) {
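+ /* The indirect bitstream base is programmed at 4 KiB granularity; the
+  * dropped low bits are added back to each slice's start address below.
+  */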
+ index_obj.MFXIndirectBitstreamObjectAddress = anv_address_add(src_buffer->address,
+ frame_info->srcBufferOffset & ~4095);
+ index_obj.MFXIndirectBitstreamObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, src_buffer->address.bo, 0),
+ };
+ index_obj.MFXIndirectMVObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ index_obj.MFDIndirectITCOEFFObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ index_obj.MFDIndirectITDBLKObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ index_obj.MFCIndirectPAKBSEObjectAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_BSP_BUF_BASE_ADDR_STATE), bsp) {
+ bsp.BSDMPCRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H264_BSD_MPC_ROW_SCRATCH].offset };
+
+ bsp.BSDMPCRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, bsp.BSDMPCRowStoreScratchBufferAddress.bo, 0),
+ };
+ bsp.MPRRowStoreScratchBufferAddress = (struct anv_address) { vid->vid_mem[ANV_VID_MEM_H264_MPR_ROW_SCRATCH].mem->bo,
+ vid->vid_mem[ANV_VID_MEM_H264_MPR_ROW_SCRATCH].offset };
+
+ bsp.MPRRowStoreScratchBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, bsp.MPRRowStoreScratchBufferAddress.bo, 0),
+ };
+ bsp.BitplaneReadBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
+ };
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_DPB_STATE), avc_dpb) {
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ const struct VkVideoDecodeH264DpbSlotInfoKHR *dpb_slot =
+ vk_find_struct_const(frame_info->pReferenceSlots[i].pNext, VIDEO_DECODE_H264_DPB_SLOT_INFO_KHR);
+ const StdVideoDecodeH264ReferenceInfo *ref_info = dpb_slot->pStdReferenceInfo;
+ int idx = frame_info->pReferenceSlots[i].slotIndex;
+ avc_dpb.NonExistingFrame[idx] = ref_info->flags.is_non_existing;
+ avc_dpb.LongTermFrame[idx] = ref_info->flags.used_for_long_term_reference;
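+ /* UsedforReference: bit 0 = top field, bit 1 = bottom field; a frame
+  * reference sets both.
+  */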
+ if (!ref_info->flags.top_field_flag && !ref_info->flags.bottom_field_flag)
+ avc_dpb.UsedforReference[idx] = 3;
+ else
+ avc_dpb.UsedforReference[idx] = ref_info->flags.top_field_flag | (ref_info->flags.bottom_field_flag << 1);
+ avc_dpb.LTSTFrameNumberList[idx] = ref_info->FrameNum;
+ }
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_PICID_STATE), picid) {
+ picid.PictureIDRemappingDisable = true;
+ }
+
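+ /* Picture height in macroblocks: when frame_mbs_only_flag is unset, a map
+  * unit covers a field pair, so the map-unit count is doubled.
+  */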
+ uint32_t pic_height = sps->pic_height_in_map_units_minus1 + 1;
+ if (!sps->flags.frame_mbs_only_flag)
+ pic_height *= 2;
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_AVC_IMG_STATE), avc_img) {
+ avc_img.FrameWidth = sps->pic_width_in_mbs_minus1;
+ avc_img.FrameHeight = pic_height - 1;
+ avc_img.FrameSize = (sps->pic_width_in_mbs_minus1 + 1) * pic_height;
+
+ if (!h264_pic_info->pStdPictureInfo->flags.field_pic_flag)
+ avc_img.ImageStructure = FramePicture;
+ else if (h264_pic_info->pStdPictureInfo->flags.bottom_field_flag)
+ avc_img.ImageStructure = BottomFieldPicture;
+ else
+ avc_img.ImageStructure = TopFieldPicture;
+
+ avc_img.WeightedBiPredictionIDC = pps->weighted_bipred_idc;
+ avc_img.WeightedPredictionEnable = pps->flags.weighted_pred_flag;
+ avc_img.FirstChromaQPOffset = pps->chroma_qp_index_offset;
+ avc_img.SecondChromaQPOffset = pps->second_chroma_qp_index_offset;
+ avc_img.FieldPicture = h264_pic_info->pStdPictureInfo->flags.field_pic_flag;
+ avc_img.MBAFFMode = (sps->flags.mb_adaptive_frame_field_flag &&
+ !h264_pic_info->pStdPictureInfo->flags.field_pic_flag);
+ avc_img.FrameMBOnly = sps->flags.frame_mbs_only_flag;
+ avc_img._8x8IDCTTransformMode = pps->flags.transform_8x8_mode_flag;
+ avc_img.Direct8x8Inference = sps->flags.direct_8x8_inference_flag;
+ avc_img.ConstrainedIntraPrediction = pps->flags.constrained_intra_pred_flag;
+ avc_img.NonReferencePicture = !h264_pic_info->pStdPictureInfo->flags.is_reference;
+ avc_img.EntropyCodingSyncEnable = pps->flags.entropy_coding_mode_flag;
+ avc_img.ChromaFormatIDC = sps->chroma_format_idc;
+ avc_img.TrellisQuantizationChromaDisable = true;
+ avc_img.NumberofReferenceFrames = frame_info->referenceSlotCount;
+ avc_img.NumberofActiveReferencePicturesfromL0 = pps->num_ref_idx_l0_default_active_minus1 + 1;
+ avc_img.NumberofActiveReferencePicturesfromL1 = pps->num_ref_idx_l1_default_active_minus1 + 1;
+ avc_img.InitialQPValue = pps->pic_init_qp_minus26;
+ avc_img.PicOrderPresent = pps->flags.bottom_field_pic_order_in_frame_present_flag;
+ avc_img.DeltaPicOrderAlwaysZero = sps->flags.delta_pic_order_always_zero_flag;
+ avc_img.PicOrderCountType = sps->pic_order_cnt_type;
+ avc_img.DeblockingFilterControlPresent = pps->flags.deblocking_filter_control_present_flag;
+ avc_img.RedundantPicCountPresent = pps->flags.redundant_pic_cnt_present_flag;
+ avc_img.Log2MaxFrameNumber = sps->log2_max_frame_num_minus4;
+ avc_img.Log2MaxPicOrderCountLSB = sps->log2_max_pic_order_cnt_lsb_minus4;
+ avc_img.CurrentPictureFrameNumber = h264_pic_info->pStdPictureInfo->frame_num;
+ }
+
+ StdVideoH264ScalingLists scaling_lists;
+ vk_video_derive_h264_scaling_list(sps, pps, &scaling_lists);
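+ /* The scaling lists are stored in zig-zag scan order; the vl_zscan_* tables
+  * remap them to the raster order the hardware expects.
+  */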
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) {
+ qm.DWordLength = 16;
+ qm.AVC = AVC_4x4_Intra_MATRIX;
+ for (unsigned m = 0; m < 3; m++)
+ for (unsigned q = 0; q < 16; q++)
+ qm.ForwardQuantizerMatrix[m * 16 + vl_zscan_normal_16[q]] = scaling_lists.ScalingList4x4[m][q];
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) {
+ qm.DWordLength = 16;
+ qm.AVC = AVC_4x4_Inter_MATRIX;
+ for (unsigned m = 0; m < 3; m++)
+ for (unsigned q = 0; q < 16; q++)
+ qm.ForwardQuantizerMatrix[m * 16 + vl_zscan_normal_16[q]] = scaling_lists.ScalingList4x4[m + 3][q];
+ }
+ if (pps->flags.transform_8x8_mode_flag) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) {
+ qm.DWordLength = 16;
+ qm.AVC = AVC_8x8_Intra_MATRIX;
+ for (unsigned q = 0; q < 64; q++)
+ qm.ForwardQuantizerMatrix[vl_zscan_normal[q]] = scaling_lists.ScalingList8x8[0][q];
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_QM_STATE), qm) {
+ qm.DWordLength = 16;
+ qm.AVC = AVC_8x8_Inter_MATRIX;
+ for (unsigned q = 0; q < 64; q++)
+ qm.ForwardQuantizerMatrix[vl_zscan_normal[q]] = scaling_lists.ScalingList8x8[1][q];
+ }
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFX_AVC_DIRECTMODE_STATE), avc_directmode) {
+ /* bind reference frame DMV */
+ struct anv_bo *dmv_bo = NULL;
+ for (unsigned i = 0; i < frame_info->referenceSlotCount; i++) {
+ int idx = frame_info->pReferenceSlots[i].slotIndex;
+ const struct VkVideoDecodeH264DpbSlotInfoKHR *dpb_slot =
+ vk_find_struct_const(frame_info->pReferenceSlots[i].pNext, VIDEO_DECODE_H264_DPB_SLOT_INFO_KHR);
+ const struct anv_image_view *ref_iv = anv_image_view_from_handle(frame_info->pReferenceSlots[i].pPictureResource->imageViewBinding);
+ const StdVideoDecodeH264ReferenceInfo *ref_info = dpb_slot->pStdReferenceInfo;
+ avc_directmode.DirectMVBufferAddress[idx] = anv_image_address(ref_iv->image,
+ &ref_iv->image->vid_dmv_top_surface);
+ if (i == 0) {
+ dmv_bo = ref_iv->image->bindings[0].address.bo;
+ }
+ avc_directmode.POCList[2 * idx] = ref_info->PicOrderCnt[0];
+ avc_directmode.POCList[2 * idx + 1] = ref_info->PicOrderCnt[1];
+ }
+ avc_directmode.DirectMVBufferAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, dmv_bo, 0),
+ };
+
+ avc_directmode.DirectMVBufferWriteAddress = anv_image_address(img,
+ &img->vid_dmv_top_surface);
+ avc_directmode.DirectMVBufferWriteAttributes = (struct GENX(MEMORYADDRESSATTRIBUTES)) {
+ .MOCS = anv_mocs(cmd_buffer->device, img->bindings[0].address.bo, 0),
+ };
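+ /* The last two POCList entries hold the current picture's top/bottom field
+  * picture order counts.
+  */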
+ avc_directmode.POCList[32] = h264_pic_info->pStdPictureInfo->PicOrderCnt[0];
+ avc_directmode.POCList[33] = h264_pic_info->pStdPictureInfo->PicOrderCnt[1];
+ }
+
+ uint32_t buffer_offset = frame_info->srcBufferOffset & 4095;
+#define HEADER_OFFSET 3
+ for (unsigned s = 0; s < h264_pic_info->sliceCount; s++) {
+ bool last_slice = s == (h264_pic_info->sliceCount - 1);
+ uint32_t current_offset = h264_pic_info->pSliceOffsets[s];
+ uint32_t this_end;
+ if (!last_slice) {
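+ /* Program the location of the following slice before the BSD object for
+  * the current one. For the second-to-last slice there is no entry after
+  * the next one, so use the end of the source range instead.
+  */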
+ uint32_t next_offset = h264_pic_info->pSliceOffsets[s + 1];
+ uint32_t next_end = (s == h264_pic_info->sliceCount - 2) ?
+ frame_info->srcBufferRange : h264_pic_info->pSliceOffsets[s + 2];
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_SLICEADDR), sliceaddr) {
+ sliceaddr.IndirectBSDDataLength = next_end - next_offset - HEADER_OFFSET;
+ /* start decoding after the 3-byte header. */
+ sliceaddr.IndirectBSDDataStartAddress = buffer_offset + next_offset + HEADER_OFFSET;
+ };
+ this_end = next_offset;
+ } else
+ this_end = frame_info->srcBufferRange;
+ anv_batch_emit(&cmd_buffer->batch, GENX(MFD_AVC_BSD_OBJECT), avc_bsd) {
+ avc_bsd.IndirectBSDDataLength = this_end - current_offset - HEADER_OFFSET;
+ /* start decoding after the 3-byte header. */
+ avc_bsd.IndirectBSDDataStartAddress = buffer_offset + current_offset + HEADER_OFFSET;
+ avc_bsd.InlineData.LastSlice = last_slice;
+ avc_bsd.InlineData.FixPrevMBSkipped = 1;
+ avc_bsd.InlineData.IntraPredictionErrorControl = 1;
+ avc_bsd.InlineData.Intra8x84x4PredictionErrorConcealmentControl = 1;
+ avc_bsd.InlineData.ISliceConcealmentMode = 1;
+ };
+ }
+}
+
+void
+genX(CmdDecodeVideoKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoDecodeInfoKHR *frame_info)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+ switch (cmd_buffer->video.vid->vk.op) {
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+ anv_h264_decode_video(cmd_buffer, frame_info);
+ break;
+ case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+ anv_h265_decode_video(cmd_buffer, frame_info);
+ break;
+ default:
+ assert(0);
+ }
+}
+
+#ifdef VK_ENABLE_BETA_EXTENSIONS
+void
+genX(CmdEncodeVideoKHR)(VkCommandBuffer commandBuffer,
+ const VkVideoEncodeInfoKHR *pEncodeInfo)
+{
+}
+#endif
diff --git a/src/intel/vulkan/genX_gfx_state.c b/src/intel/vulkan/genX_gfx_state.c
new file mode 100644
index 00000000000..5f0b1e1c538
--- /dev/null
+++ b/src/intel/vulkan/genX_gfx_state.c
@@ -0,0 +1,2385 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "anv_private.h"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "common/intel_genX_state_brw.h"
+#include "common/intel_guardband.h"
+#include "common/intel_tiled_render.h"
+#include "compiler/brw_prim.h"
+
+const uint32_t genX(vk_to_intel_blend)[] = {
+ [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO,
+ [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE,
+ [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR,
+ [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR,
+ [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR,
+ [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR,
+ [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA,
+ [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA,
+ [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA,
+ [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA,
+ [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR,
+ [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
+ [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA,
+ [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
+ [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE,
+ [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR,
+ [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR,
+ [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA,
+ [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA,
+};
+
+static const uint32_t genX(vk_to_intel_blend_op)[] = {
+ [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD,
+ [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT,
+ [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
+ [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN,
+ [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX,
+};
+
+static void
+genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer)
+{
+#if INTEL_WA_16013994831_GFX_VER
+ /* Wa_16013994831 - Disable preemption during streamout and re-enable it
+ * when the current pipeline does not use XFB.
+ *
+ * Although this workaround applies to Gfx12+, we already disable
+ * object-level preemption for another reason in genX_state.c, so we can
+ * skip this for Gfx12.
+ */
+ if (!intel_needs_workaround(cmd_buffer->device->info, 16013994831))
+ return;
+
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ if (pipeline->uses_xfb) {
+ genX(cmd_buffer_set_preemption)(cmd_buffer, false);
+ return;
+ }
+
+ if (!cmd_buffer->state.gfx.object_preemption)
+ genX(cmd_buffer_set_preemption)(cmd_buffer, true);
+#endif
+}
+
+#if GFX_VER >= 12
+static uint32_t
+get_cps_state_offset(struct anv_cmd_buffer *cmd_buffer, bool cps_enabled,
+ const struct vk_fragment_shading_rate_state *fsr)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ if (!cps_enabled) {
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ return cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ device->cps_states_db.offset :
+ device->cps_states.offset;
+ }
+
+ uint32_t offset;
+ static const uint32_t size_index[] = {
+ [1] = 0,
+ [2] = 1,
+ [4] = 2,
+ };
+
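+ /* Index into the device's pre-packed CPS_STATE arrays: entry 0 is the
+  * disabled state, the rest are keyed on the fragment size (and, on newer
+  * platforms, the combiner ops), each entry holding one state per viewport.
+  */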
+#if GFX_VERx10 >= 125
+ offset =
+ 1 + /* skip disabled */
+ fsr->combiner_ops[0] * 5 * 3 * 3 +
+ fsr->combiner_ops[1] * 3 * 3 +
+ size_index[fsr->fragment_size.width] * 3 +
+ size_index[fsr->fragment_size.height];
+#else
+ offset =
+ 1 + /* skip disabled */
+ size_index[fsr->fragment_size.width] * 3 +
+ size_index[fsr->fragment_size.height];
+#endif
+
+ offset *= MAX_VIEWPORTS * GENX(CPS_STATE_length) * 4;
+
+ assert(cmd_buffer->state.current_db_mode !=
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_UNKNOWN);
+ return (cmd_buffer->state.current_db_mode ==
+ ANV_CMD_DESCRIPTOR_BUFFER_MODE_BUFFER ?
+ device->cps_states_db.offset :
+ device->cps_states.offset) + offset;
+}
+#endif /* GFX_VER >= 12 */
+
+static bool
+has_ds_feedback_loop(const struct vk_dynamic_graphics_state *dyn)
+{
+ return dyn->feedback_loops & (VK_IMAGE_ASPECT_DEPTH_BIT |
+ VK_IMAGE_ASPECT_STENCIL_BIT);
+}
+
+UNUSED static bool
+want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer,
+ const struct vk_dynamic_graphics_state *dyn,
+ const struct vk_depth_stencil_state *ds)
+{
+ if (GFX_VER > 9)
+ return false;
+ assert(GFX_VER == 9);
+
+ /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
+ *
+ * Clearing this bit will force the STC cache to wait for pending
+ * retirement of pixels at the HZ-read stage and do the STC-test for
+ * Non-promoted, R-computed and Computed depth modes instead of
+ * postponing the STC-test to RCPFE.
+ *
+ * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
+ * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
+ *
+ * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
+ * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+ * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
+ *
+ * COMP_STC_EN = STC_TEST_EN &&
+ * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil
+ *
+ * SW parses the pipeline states to generate the following logical
+ * signal indicating if PMA FIX can be enabled.
+ *
+ * STC_PMA_OPT =
+ * 3DSTATE_WM::ForceThreadDispatch != 1 &&
+ * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
+ * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
+ * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
+ * !(3DSTATE_WM::EDSC_Mode == 2) &&
+ * 3DSTATE_PS_EXTRA::PixelShaderValid &&
+ * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+ * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+ * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+ * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
+ * (COMP_STC_EN || STC_WRITE_EN) &&
+ * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+ * 3DSTATE_WM::ForceKillPix == ON ||
+ * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+ * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+ * 3DSTATE_PS_BLEND::AlphaTestEnable ||
+ * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
+ * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
+ */
+
+ /* These are always true:
+ * 3DSTATE_WM::ForceThreadDispatch != 1 &&
+ * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
+ */
+
+ /* We only enable the PMA fix if we know for certain that HiZ is enabled.
+ * If we don't know whether HiZ is enabled or not, we disable the PMA fix
+ * and there is no harm.
+ *
+ * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
+ * 3DSTATE_DEPTH_BUFFER::HIZ Enable
+ */
+ if (!cmd_buffer->state.hiz_enabled)
+ return false;
+
+ /* We can't possibly know if HiZ is enabled without the depth attachment */
+ ASSERTED const struct anv_image_view *d_iview =
+ cmd_buffer->state.gfx.depth_att.iview;
+ assert(d_iview && d_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);
+
+ /* 3DSTATE_PS_EXTRA::PixelShaderValid */
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
+ return false;
+
+ /* !(3DSTATE_WM::EDSC_Mode == 2) */
+ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ if (wm_prog_data->early_fragment_tests)
+ return false;
+
+ /* We never use anv_pipeline for HiZ ops so this is trivially true:
+ * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
+ * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
+ * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
+ * 3DSTATE_WM_HZ_OP::StencilBufferClear)
+ */
+
+ /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
+ * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
+ */
+ const bool stc_test_en = ds->stencil.test_enable;
+
+ /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
+ * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
+ * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
+ */
+ const bool stc_write_en = ds->stencil.write_enable;
+
+ /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
+ const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil;
+
+ /* COMP_STC_EN || STC_WRITE_EN */
+ if (!(comp_stc_en || stc_write_en))
+ return false;
+
+ /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
+ * 3DSTATE_WM::ForceKillPix == ON ||
+ * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
+ * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
+ * 3DSTATE_PS_BLEND::AlphaTestEnable ||
+ * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
+ * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
+ */
+ return pipeline->kill_pixel ||
+ pipeline->rp_has_ds_self_dep ||
+ has_ds_feedback_loop(dyn) ||
+ wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
+}
+
+static void
+genX(rasterization_mode)(VkPolygonMode raster_mode,
+ VkLineRasterizationModeKHR line_mode,
+ float line_width,
+ uint32_t *api_mode,
+ bool *msaa_rasterization_enable)
+{
+ if (raster_mode == VK_POLYGON_MODE_LINE) {
+ /* Unfortunately, configuring our line rasterization hardware on gfx8
+ * and later is rather painful. Instead of giving us bits to tell the
+ * hardware what line mode to use like we had on gfx7, we now have an
+ * arcane combination of API Mode and MSAA enable bits which do things
+ * in a table which are expected to magically put the hardware into the
+ * right mode for your API. Sadly, Vulkan isn't any of the APIs the
+ * hardware people thought of so nothing works the way you want it to.
+ *
+ * Look at the table titled "Multisample Rasterization Modes" in Vol 7
+ * of the Skylake PRM for more details.
+ */
+ switch (line_mode) {
+ case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
+ *api_mode = DX101;
+#if GFX_VER <= 9
+ /* Prior to ICL, the algorithm the HW uses to draw wide lines
+ * doesn't quite match what the CTS expects, at least for rectangular
+ * lines, so we set this to false here, making it draw parallelograms
+ * instead, which work well enough.
+ */
+ *msaa_rasterization_enable = line_width < 1.0078125;
+#else
+ *msaa_rasterization_enable = true;
+#endif
+ break;
+
+ case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
+ case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
+ *api_mode = DX9OGL;
+ *msaa_rasterization_enable = false;
+ break;
+
+ default:
+ unreachable("Unsupported line rasterization mode");
+ }
+ } else {
+ *api_mode = DX101;
+ *msaa_rasterization_enable = true;
+ }
+}
+
+static bool
+is_src1_blend_factor(enum GENX(3D_Color_Buffer_Blend_Factor) factor)
+{
+ return factor == BLENDFACTOR_SRC1_COLOR ||
+ factor == BLENDFACTOR_SRC1_ALPHA ||
+ factor == BLENDFACTOR_INV_SRC1_COLOR ||
+ factor == BLENDFACTOR_INV_SRC1_ALPHA;
+}
+
+#if GFX_VERx10 == 125
+/**
+ * Return the dimensions of the current rendering area, defined as the
+ * bounding box of all present color, depth and stencil attachments.
+ */
+UNUSED static bool
+calculate_render_area(struct anv_cmd_buffer *cmd_buffer,
+ unsigned *width, unsigned *height)
+{
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+
+ *width = gfx->render_area.offset.x + gfx->render_area.extent.width;
+ *height = gfx->render_area.offset.y + gfx->render_area.extent.height;
+
+ for (unsigned i = 0; i < gfx->color_att_count; i++) {
+ struct anv_attachment *att = &gfx->color_att[i];
+ if (att->iview) {
+ *width = MAX2(*width, att->iview->vk.extent.width);
+ *height = MAX2(*height, att->iview->vk.extent.height);
+ }
+ }
+
+ const struct anv_image_view *const z_view = gfx->depth_att.iview;
+ if (z_view) {
+ *width = MAX2(*width, z_view->vk.extent.width);
+ *height = MAX2(*height, z_view->vk.extent.height);
+ }
+
+ const struct anv_image_view *const s_view = gfx->stencil_att.iview;
+ if (s_view) {
+ *width = MAX2(*width, s_view->vk.extent.width);
+ *height = MAX2(*height, s_view->vk.extent.height);
+ }
+
+ return *width && *height;
+}
+
+/* Calculate TBIMR tiling parameters adequate for the current pipeline
+ * setup. Return true if TBIMR should be enabled.
+ */
+UNUSED static bool
+calculate_tile_dimensions(struct anv_cmd_buffer *cmd_buffer,
+ unsigned fb_width, unsigned fb_height,
+ unsigned *tile_width, unsigned *tile_height)
+{
+ const struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const unsigned aux_scale = 256;
+ unsigned pixel_size = 0;
+
+ /* Perform a rough calculation of the tile cache footprint of the
+ * pixel pipeline, approximating it as the sum of the amount of
+ * memory used per pixel by every render target, depth, stencil and
+ * auxiliary surfaces bound to the pipeline.
+ */
+ for (uint32_t i = 0; i < gfx->color_att_count; i++) {
+ struct anv_attachment *att = &gfx->color_att[i];
+
+ if (att->iview) {
+ const struct anv_image *image = att->iview->image;
+ const unsigned p = anv_image_aspect_to_plane(image,
+ VK_IMAGE_ASPECT_COLOR_BIT);
+ const struct anv_image_plane *plane = &image->planes[p];
+
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl);
+
+ if (isl_aux_usage_has_mcs(att->aux_usage))
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->aux_surface.isl);
+
+ /* XXX - Use proper implicit CCS surface metadata tracking
+ * instead of inferring pixel size from primary
+ * surface.
+ */
+ if (isl_aux_usage_has_ccs(att->aux_usage))
+ pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl),
+ aux_scale);
+ }
+ }
+
+ const struct anv_image_view *const z_view = gfx->depth_att.iview;
+ if (z_view) {
+ const struct anv_image *image = z_view->image;
+ assert(image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
+ const unsigned p = anv_image_aspect_to_plane(image,
+ VK_IMAGE_ASPECT_DEPTH_BIT);
+ const struct anv_image_plane *plane = &image->planes[p];
+
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl);
+
+ if (isl_aux_usage_has_hiz(image->planes[p].aux_usage))
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->aux_surface.isl);
+
+ /* XXX - Use proper implicit CCS surface metadata tracking
+ * instead of inferring pixel size from primary
+ * surface.
+ */
+ if (isl_aux_usage_has_ccs(image->planes[p].aux_usage))
+ pixel_size += DIV_ROUND_UP(intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl),
+ aux_scale);
+ }
+
+ const struct anv_image_view *const s_view = gfx->stencil_att.iview;
+ if (s_view && s_view != z_view) {
+ const struct anv_image *image = s_view->image;
+ assert(image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
+ const unsigned p = anv_image_aspect_to_plane(image,
+ VK_IMAGE_ASPECT_STENCIL_BIT);
+ const struct anv_image_plane *plane = &image->planes[p];
+
+ pixel_size += intel_calculate_surface_pixel_size(
+ &plane->primary_surface.isl);
+ }
+
+ if (!pixel_size)
+ return false;
+
+ /* Compute a tile layout that allows reasonable utilization of the
+ * tile cache based on the per-pixel cache footprint estimated
+ * above.
+ */
+ intel_calculate_tile_dimensions(device->info, cmd_buffer->state.current_l3_config,
+ 32, 32, fb_width, fb_height,
+ pixel_size, tile_width, tile_height);
+
+ /* Perform TBIMR tile passes only if the framebuffer covers more
+ * than a single tile.
+ */
+ return *tile_width < fb_width || *tile_height < fb_height;
+}
+#endif
+
+/**
+ * This function takes the Vulkan runtime values & dirty states and updates
+ * the values in anv_gfx_dynamic_state, flagging HW instructions for
+ * re-emission when a value changes.
+ *
+ * Nothing is emitted in the batch buffer.
+ */
+void
+genX(cmd_buffer_flush_gfx_runtime_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+ UNUSED struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ const struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx->base.pipeline);
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
+ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ struct anv_instance *instance = cmd_buffer->device->physical->instance;
+
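+/* Helper macros: update a field in the HW shadow state and mark the
+ * corresponding instruction dirty only when the value actually changes.
+ * SET_STAGE() skips the dirty flagging when the pipeline lacks the stage.
+ */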
+#define GET(field) hw_state->field
+#define SET(bit, field, value) \
+ do { \
+ __typeof(hw_state->field) __v = value; \
+ if (hw_state->field != __v) { \
+ hw_state->field = __v; \
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
+ } \
+ } while (0)
+#define SET_STAGE(bit, field, value, stage) \
+ do { \
+ __typeof(hw_state->field) __v = value; \
+ if (!anv_pipeline_has_stage(pipeline, \
+ MESA_SHADER_##stage)) { \
+ hw_state->field = __v; \
+ break; \
+ } \
+ if (hw_state->field != __v) { \
+ hw_state->field = __v; \
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_##bit); \
+ } \
+ } while (0)
+
+#define SETUP_PROVOKING_VERTEX(bit, cmd, mode) \
+ switch (mode) { \
+ case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT: \
+ SET(bit, cmd.TriangleStripListProvokingVertexSelect, 0); \
+ SET(bit, cmd.LineStripListProvokingVertexSelect, 0); \
+ SET(bit, cmd.TriangleFanProvokingVertexSelect, 1); \
+ break; \
+ case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT: \
+ SET(bit, cmd.TriangleStripListProvokingVertexSelect, 2); \
+ SET(bit, cmd.LineStripListProvokingVertexSelect, 1); \
+ SET(bit, cmd.TriangleFanProvokingVertexSelect, 2); \
+ break; \
+ default: \
+ unreachable("Invalid provoking vertex mode"); \
+ } \
+
+ UNUSED bool fs_msaa_changed = false;
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR)) {
+ enum intel_msaa_flags fs_msaa_flags = 0;
+
+ if (wm_prog_data) {
+ /* If we have any dynamic bits here, we might need to update the
+ * value in the push constant for the shader.
+ */
+ if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES ||
+ wm_prog_data->persample_dispatch == BRW_SOMETIMES ||
+ wm_prog_data->alpha_to_coverage == BRW_SOMETIMES) {
+ fs_msaa_flags = INTEL_MSAA_FLAG_ENABLE_DYNAMIC;
+
+ if (dyn->ms.rasterization_samples > 1) {
+ fs_msaa_flags |= INTEL_MSAA_FLAG_MULTISAMPLE_FBO;
+
+ if (wm_prog_data->sample_shading) {
+ assert(wm_prog_data->persample_dispatch != BRW_NEVER);
+ fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH;
+ }
+ if ((pipeline->sample_shading_enable &&
+ (pipeline->min_sample_shading * dyn->ms.rasterization_samples) > 1) ||
+ wm_prog_data->sample_shading) {
+ fs_msaa_flags |= INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH |
+ INTEL_MSAA_FLAG_PERSAMPLE_INTERP;
+ }
+ }
+
+ if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES &&
+ !(fs_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)) {
+ fs_msaa_flags |= INTEL_MSAA_FLAG_COARSE_PI_MSG |
+ INTEL_MSAA_FLAG_COARSE_RT_WRITES;
+ }
+
+ if (wm_prog_data->alpha_to_coverage == BRW_SOMETIMES &&
+ dyn->ms.alpha_to_coverage_enable)
+ fs_msaa_flags |= INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE;
+
+ /* Check the last push constant value and update */
+
+ if (gfx->base.push_constants.gfx.fs_msaa_flags != fs_msaa_flags) {
+ gfx->base.push_constants.gfx.fs_msaa_flags = fs_msaa_flags;
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+ gfx->base.push_constants_data_dirty = true;
+ }
+ }
+ }
+
+ if (fs_msaa_flags != gfx->fs_msaa_flags) {
+ gfx->fs_msaa_flags = fs_msaa_flags;
+ gfx->dirty |= ANV_CMD_DIRTY_FS_MSAA_FLAGS;
+ }
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS)) {
+ if (wm_prog_data) {
+ const struct anv_shader_bin *fs_bin =
+ pipeline->base.shaders[MESA_SHADER_FRAGMENT];
+
+ struct GENX(3DSTATE_PS) ps = {};
+ intel_set_ps_dispatch_state(&ps, device->info, wm_prog_data,
+ MAX2(dyn->ms.rasterization_samples, 1),
+ gfx->fs_msaa_flags);
+
+ SET(PS, ps.KernelStartPointer0,
+ fs_bin->kernel.offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0));
+ SET(PS, ps.KernelStartPointer1,
+ fs_bin->kernel.offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1));
+#if GFX_VER < 20
+ SET(PS, ps.KernelStartPointer2,
+ fs_bin->kernel.offset +
+ brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2));
+#endif
+
+ SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData0,
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0));
+ SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData1,
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1));
+#if GFX_VER < 20
+ SET(PS, ps.DispatchGRFStartRegisterForConstantSetupData2,
+ brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2));
+#endif
+
+#if GFX_VER < 20
+ SET(PS, ps._8PixelDispatchEnable, ps._8PixelDispatchEnable);
+ SET(PS, ps._16PixelDispatchEnable, ps._16PixelDispatchEnable);
+ SET(PS, ps._32PixelDispatchEnable, ps._32PixelDispatchEnable);
+#else
+ SET(PS, ps.Kernel0Enable, ps.Kernel0Enable);
+ SET(PS, ps.Kernel1Enable, ps.Kernel1Enable);
+ SET(PS, ps.Kernel0SIMDWidth, ps.Kernel0SIMDWidth);
+ SET(PS, ps.Kernel1SIMDWidth, ps.Kernel1SIMDWidth);
+ SET(PS, ps.Kernel0PolyPackingPolicy, ps.Kernel0PolyPackingPolicy);
+#endif
+
+ SET(PS, ps.PositionXYOffsetSelect,
+ !wm_prog_data->uses_pos_offset ? POSOFFSET_NONE :
+ brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags) ?
+ POSOFFSET_SAMPLE : POSOFFSET_CENTROID);
+
+ SET(PS_EXTRA, ps_extra.PixelShaderIsPerSample,
+ brw_wm_prog_data_is_persample(wm_prog_data, gfx->fs_msaa_flags));
+#if GFX_VER >= 11
+ SET(PS_EXTRA, ps_extra.PixelShaderIsPerCoarsePixel,
+ brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags));
+#endif
+#if GFX_VERx10 >= 125
+ /* TODO: We should only require this when the last geometry shader
+ * uses a fragment shading rate that is not constant.
+ */
+ SET(PS_EXTRA, ps_extra.EnablePSDependencyOnCPsizeChange,
+ brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags));
+#endif
+ SET(WM, wm.BarycentricInterpolationMode,
+ wm_prog_data_barycentric_modes(wm_prog_data, gfx->fs_msaa_flags));
+ } else {
+#if GFX_VER < 20
+ SET(PS, ps._8PixelDispatchEnable, false);
+ SET(PS, ps._16PixelDispatchEnable, false);
+ SET(PS, ps._32PixelDispatchEnable, false);
+#else
+ SET(PS, ps.Kernel0Enable, false);
+ SET(PS, ps.Kernel1Enable, false);
+#endif
+ }
+ }
+
+ if ((gfx->dirty & (ANV_CMD_DIRTY_PIPELINE |
+ ANV_CMD_DIRTY_XFB_ENABLE |
+ ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE)) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
+ SET(STREAMOUT, so.RenderingDisable, dyn->rs.rasterizer_discard_enable);
+ SET(STREAMOUT, so.RenderStreamSelect, dyn->rs.rasterization_stream);
+
+#if INTEL_NEEDS_WA_18022508906
+ /* Wa_18022508906 :
+ *
+ * SKL PRMs, Volume 7: 3D-Media-GPGPU, Stream Output Logic (SOL) Stage:
+ *
+ * SOL_INT::Render_Enable =
+ * (3DSTATE_STREAMOUT::Force_Rendering == Force_On) ||
+ * (
+ * (3DSTATE_STREAMOUT::Force_Rendering != Force_Off) &&
+ * !(3DSTATE_GS::Enable && 3DSTATE_GS::Output Vertex Size == 0) &&
+ * !3DSTATE_STREAMOUT::API_Render_Disable &&
+ * (
+ * 3DSTATE_DEPTH_STENCIL_STATE::Stencil_TestEnable ||
+ * 3DSTATE_DEPTH_STENCIL_STATE::Depth_TestEnable ||
+ * 3DSTATE_DEPTH_STENCIL_STATE::Depth_WriteEnable ||
+ * 3DSTATE_PS_EXTRA::PS_Valid ||
+ * 3DSTATE_WM::Legacy Depth_Buffer_Clear ||
+ * 3DSTATE_WM::Legacy Depth_Buffer_Resolve_Enable ||
+ * 3DSTATE_WM::Legacy Hierarchical_Depth_Buffer_Resolve_Enable
+ * )
+ * )
+ *
+ * If SOL_INT::Render_Enable is false, the SO stage will not forward any
+ * topologies down the pipeline, which is not what we want for occlusion
+ * queries.
+ *
+ * Here we force rendering to get SOL_INT::Render_Enable when occlusion
+ * queries are active.
+ */
+ SET(STREAMOUT, so.ForceRendering,
+ (!GET(so.RenderingDisable) && gfx->n_occlusion_queries > 0) ?
+ Force_on : 0);
+#endif
+
+ switch (dyn->rs.provoking_vertex) {
+ case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
+ SET(STREAMOUT, so.ReorderMode, LEADING);
+ SET_STAGE(GS, gs.ReorderMode, LEADING, GEOMETRY);
+ break;
+
+ case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
+ SET(STREAMOUT, so.ReorderMode, TRAILING);
+ SET_STAGE(GS, gs.ReorderMode, TRAILING, GEOMETRY);
+ break;
+
+ default:
+ unreachable("Invalid provoking vertex mode");
+ }
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
+ uint32_t topology;
+ if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
+ topology = _3DPRIM_PATCHLIST(dyn->ts.patch_control_points);
+ else
+ topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
+
+ gfx->primitive_topology = topology;
+
+ SET(VF_TOPOLOGY, vft.PrimitiveTopologyType, topology);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+
+#if GFX_VER >= 11
+ if (cmd_buffer->device->vk.enabled_extensions.KHR_fragment_shading_rate &&
+ ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_FS_MSAA_FLAGS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_FSR))) {
+ const bool cps_enable = wm_prog_data &&
+ brw_wm_prog_data_is_coarse(wm_prog_data, gfx->fs_msaa_flags);
+#if GFX_VER == 11
+ SET(CPS, cps.CoarsePixelShadingMode,
+ cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE);
+ SET(CPS, cps.MinCPSizeX, dyn->fsr.fragment_size.width);
+ SET(CPS, cps.MinCPSizeY, dyn->fsr.fragment_size.height);
+#elif GFX_VER >= 12
+ SET(CPS, cps.CoarsePixelShadingStateArrayPointer,
+ get_cps_state_offset(cmd_buffer, cps_enable, &dyn->fsr));
+#endif
+ }
+#endif /* GFX_VER >= 11 */
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN)) {
+ const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
+
+ if (tes_prog_data && anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+ if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
+ SET(TE, te.OutputTopology, tes_prog_data->output_topology);
+ } else {
+ /* When the origin is upper-left, we have to flip the winding order */
+ if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
+ SET(TE, te.OutputTopology, OUTPUT_TRI_CW);
+ } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
+ SET(TE, te.OutputTopology, OUTPUT_TRI_CCW);
+ } else {
+ SET(TE, te.OutputTopology, tes_prog_data->output_topology);
+ }
+ }
+ } else {
+ SET(TE, te.OutputTopology, OUTPUT_POINT);
+ }
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
+ SET(SF, sf.LineWidth, dyn->rs.line.width);
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_PROVOKING_VERTEX)) {
+ SETUP_PROVOKING_VERTEX(SF, sf, dyn->rs.provoking_vertex);
+ SETUP_PROVOKING_VERTEX(CLIP, clip, dyn->rs.provoking_vertex);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
+ /**
+ * From the Vulkan Spec:
+ *
+ * "VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT specifies that the depth
+ * bias representation is a factor of constant r equal to 1."
+ *
+ * From the SKL PRMs, Volume 7: 3D-Media-GPGPU, Depth Offset:
+ *
+ * "When UNORM Depth Buffer is at Output Merger (or no Depth Buffer):
+ *
+ * Bias = GlobalDepthOffsetConstant * r + GlobalDepthOffsetScale * MaxDepthSlope
+ *
+ * Where r is the minimum representable value > 0 in the depth
+ * buffer format, converted to float32 (note: If state bit Legacy
+ * Global Depth Bias Enable is set, the r term will be forced to
+ * 1.0)"
+ *
+ * When VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT is set, enable
+ * LegacyGlobalDepthBiasEnable.
+ */
+ SET(SF, sf.LegacyGlobalDepthBiasEnable,
+ dyn->rs.depth_bias.representation ==
+ VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE))
+ SET(CLIP, clip.APIMode, dyn->vp.depth_clip_negative_one_to_one ? APIMODE_OGL : APIMODE_D3D);
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_POLYGON_MODE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_MODE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE)) {
+ /* Take dynamic primitive topology into account with
+ * 3DSTATE_RASTER::APIMode
+ * 3DSTATE_RASTER::DXMultisampleRasterizationEnable
+ * 3DSTATE_RASTER::AntialiasingEnable
+ */
+ uint32_t api_mode = 0;
+ bool msaa_raster_enable = false;
+
+ const VkLineRasterizationModeKHR line_mode =
+ anv_line_rasterization_mode(dyn->rs.line.mode,
+ dyn->ms.rasterization_samples);
+
+ const VkPolygonMode dynamic_raster_mode =
+ genX(raster_polygon_mode)(pipeline,
+ dyn->rs.polygon_mode,
+ dyn->ia.primitive_topology);
+
+ genX(rasterization_mode)(dynamic_raster_mode,
+ line_mode, dyn->rs.line.width,
+ &api_mode, &msaa_raster_enable);
+
+ /* From the Broadwell PRM, Volume 2, documentation for
+ * 3DSTATE_RASTER, "Antialiasing Enable":
+ *
+ * "This field must be disabled if any of the render targets
+ * have integer (UINT or SINT) surface format."
+ *
+ * Additionally internal documentation for Gfx12+ states:
+ *
+ * "This bit MUST not be set when NUM_MULTISAMPLES > 1 OR
+ * FORCED_SAMPLE_COUNT > 1."
+ */
+ const bool aa_enable =
+ anv_rasterization_aa_mode(dynamic_raster_mode, line_mode) &&
+ !gfx->has_uint_rt &&
+ !(GFX_VER >= 12 && gfx->samples > 1);
+
+ const bool depth_clip_enable =
+ vk_rasterization_state_depth_clip_enable(&dyn->rs);
+
+ const bool xy_clip_test_enable =
+ (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
+
+ SET(CLIP, clip.ViewportXYClipTestEnable, xy_clip_test_enable);
+
+ SET(RASTER, raster.APIMode, api_mode);
+ SET(RASTER, raster.DXMultisampleRasterizationEnable, msaa_raster_enable);
+ SET(RASTER, raster.AntialiasingEnable, aa_enable);
+ SET(RASTER, raster.CullMode, genX(vk_to_intel_cullmode)[dyn->rs.cull_mode]);
+ SET(RASTER, raster.FrontWinding, genX(vk_to_intel_front_face)[dyn->rs.front_face]);
+ SET(RASTER, raster.GlobalDepthOffsetEnableSolid, dyn->rs.depth_bias.enable);
+ SET(RASTER, raster.GlobalDepthOffsetEnableWireframe, dyn->rs.depth_bias.enable);
+ SET(RASTER, raster.GlobalDepthOffsetEnablePoint, dyn->rs.depth_bias.enable);
+ SET(RASTER, raster.GlobalDepthOffsetConstant, dyn->rs.depth_bias.constant);
+ SET(RASTER, raster.GlobalDepthOffsetScale, dyn->rs.depth_bias.slope);
+ SET(RASTER, raster.GlobalDepthOffsetClamp, dyn->rs.depth_bias.clamp);
+ SET(RASTER, raster.FrontFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
+ SET(RASTER, raster.BackFaceFillMode, genX(vk_to_intel_fillmode)[dyn->rs.polygon_mode]);
+ SET(RASTER, raster.ViewportZFarClipTestEnable, depth_clip_enable);
+ SET(RASTER, raster.ViewportZNearClipTestEnable, depth_clip_enable);
+ SET(RASTER, raster.ConservativeRasterizationEnable,
+ dyn->rs.conservative_mode !=
+ VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES)) {
+ SET(MULTISAMPLE, ms.NumberofMultisamples,
+ __builtin_ffs(MAX2(dyn->ms.rasterization_samples, 1)) - 1);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_MASK)) {
+ /* From the Vulkan 1.0 spec:
+ * If pSampleMask is NULL, it is treated as if the mask has all bits
+ * enabled, i.e. no coverage is removed from fragments.
+ *
+ * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
+ */
+ SET(SAMPLE_MASK, sm.SampleMask, dyn->ms.sample_mask & 0xffff);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+#if GFX_VER == 9
+ /* For the PMA fix */
+ (gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+#endif
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE)) {
+ VkImageAspectFlags ds_aspects = 0;
+ if (gfx->depth_att.vk_format != VK_FORMAT_UNDEFINED)
+ ds_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
+ if (gfx->stencil_att.vk_format != VK_FORMAT_UNDEFINED)
+ ds_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
+
+ struct vk_depth_stencil_state opt_ds = dyn->ds;
+ vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);
+
+ SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true);
+
+ SET(WM_DEPTH_STENCIL, ds.StencilTestMask,
+ opt_ds.stencil.front.compare_mask & 0xff);
+ SET(WM_DEPTH_STENCIL, ds.StencilWriteMask,
+ opt_ds.stencil.front.write_mask & 0xff);
+
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);
+
+ SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue,
+ opt_ds.stencil.front.reference & 0xff);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue,
+ opt_ds.stencil.back.reference & 0xff);
+
+ SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable);
+ SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
+ SET(WM_DEPTH_STENCIL, ds.DepthTestFunction,
+ genX(vk_to_intel_compare_op)[opt_ds.depth.compare_op]);
+ SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable);
+ SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable, opt_ds.stencil.write_enable);
+ SET(WM_DEPTH_STENCIL, ds.StencilFailOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.fail]);
+ SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.pass]);
+ SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.front.op.depth_fail]);
+ SET(WM_DEPTH_STENCIL, ds.StencilTestFunction,
+ genX(vk_to_intel_compare_op)[opt_ds.stencil.front.op.compare]);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.fail]);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.pass]);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp,
+ genX(vk_to_intel_stencil_op)[opt_ds.stencil.back.op.depth_fail]);
+ SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction,
+ genX(vk_to_intel_compare_op)[opt_ds.stencil.back.op.compare]);
+
+#if GFX_VER == 9
+ const bool pma = want_stencil_pma_fix(cmd_buffer, dyn, &opt_ds);
+ SET(PMA_FIX, pma_fix, pma);
+#endif
+
+#if GFX_VERx10 >= 125
+ if (intel_needs_workaround(cmd_buffer->device->info, 18019816803)) {
+ bool ds_write_state = opt_ds.depth.write_enable || opt_ds.stencil.write_enable;
+ if (gfx->ds_write_state != ds_write_state) {
+ gfx->ds_write_state = ds_write_state;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WA_18019816803);
+ }
+ }
+#endif
+ }
+
+#if GFX_VER >= 12
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS)) {
+ SET(DEPTH_BOUNDS, db.DepthBoundsTestEnable, dyn->ds.depth.bounds_test.enable);
+ /* Only look at updating the bounds if testing is enabled */
+ if (dyn->ds.depth.bounds_test.enable) {
+ SET(DEPTH_BOUNDS, db.DepthBoundsTestMinValue, dyn->ds.depth.bounds_test.min);
+ SET(DEPTH_BOUNDS, db.DepthBoundsTestMaxValue, dyn->ds.depth.bounds_test.max);
+ }
+ }
+#endif
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_STIPPLE_ENABLE)) {
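+ /* The HW wants both the repeat count and its reciprocal; clamp the
+ * factor to at least 1 so the reciprocal never divides by zero.
+ */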
+ SET(LINE_STIPPLE, ls.LineStipplePattern, dyn->rs.line.stipple.pattern);
+ SET(LINE_STIPPLE, ls.LineStippleInverseRepeatCount,
+ 1.0f / MAX2(1, dyn->rs.line.stipple.factor));
+ SET(LINE_STIPPLE, ls.LineStippleRepeatCount, dyn->rs.line.stipple.factor);
+
+ SET(WM, wm.LineStippleEnable, dyn->rs.line.stipple.enable);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_RESTART_INDEX) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE)) {
+ SET(VF, vf.IndexedDrawCutIndexEnable, dyn->ia.primitive_restart_enable);
+ SET(VF, vf.CutIndex, gfx->restart_index);
+ }
+
+ if (gfx->dirty & ANV_CMD_DIRTY_INDEX_BUFFER)
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER);
+
+#if GFX_VERx10 >= 125
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_RESTART_ENABLE))
+ SET(VFG, vfg.ListCutIndexEnable, dyn->ia.primitive_restart_enable);
+#endif
+
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
+ (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE)))
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN);
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) {
+ /* Reprogram 3DSTATE_WM in the hope we can avoid spawning fragment
+ * shader threads.
+ */
+ bool force_thread_dispatch =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
+ (pipeline->force_fragment_thread_dispatch ||
+ anv_cmd_buffer_all_color_write_masked(cmd_buffer));
+ SET(WM, wm.ForceThreadDispatchEnable, force_thread_dispatch ? ForceON : 0);
+ }
+
+ if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)) {
+ SET_STAGE(PS_EXTRA, ps_extra.PixelShaderKillsPixel,
+ wm_prog_data && (pipeline->rp_has_ds_self_dep ||
+ has_ds_feedback_loop(dyn) ||
+ wm_prog_data->uses_kill),
+ FRAGMENT);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ (gfx->dirty & ANV_CMD_DIRTY_RENDER_TARGETS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS)) {
+ const uint8_t color_writes = dyn->cb.color_write_enables;
+ bool has_writeable_rt =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT) &&
+ (color_writes & ((1u << gfx->color_att_count) - 1)) != 0;
+
+ SET(BLEND_STATE, blend.AlphaToCoverageEnable,
+ dyn->ms.alpha_to_coverage_enable);
+ SET(BLEND_STATE, blend.AlphaToOneEnable,
+ dyn->ms.alpha_to_one_enable);
+
+ bool independent_alpha_blend = false;
+ /* Wa_14018912822, check if we set these during RT setup. */
+ bool color_blend_zero = false;
+ bool alpha_blend_zero = false;
+ for (uint32_t i = 0; i < MAX_RTS; i++) {
+ /* Disable anything above the current number of color attachments. */
+ bool write_disabled = i >= gfx->color_att_count ||
+ (color_writes & BITFIELD_BIT(i)) == 0;
+
+ SET(BLEND_STATE, blend.rts[i].WriteDisableAlpha,
+ write_disabled ||
+ (dyn->cb.attachments[i].write_mask &
+ VK_COLOR_COMPONENT_A_BIT) == 0);
+ SET(BLEND_STATE, blend.rts[i].WriteDisableRed,
+ write_disabled ||
+ (dyn->cb.attachments[i].write_mask &
+ VK_COLOR_COMPONENT_R_BIT) == 0);
+ SET(BLEND_STATE, blend.rts[i].WriteDisableGreen,
+ write_disabled ||
+ (dyn->cb.attachments[i].write_mask &
+ VK_COLOR_COMPONENT_G_BIT) == 0);
+ SET(BLEND_STATE, blend.rts[i].WriteDisableBlue,
+ write_disabled ||
+ (dyn->cb.attachments[i].write_mask &
+ VK_COLOR_COMPONENT_B_BIT) == 0);
+ /* Vulkan specification 1.2.168, VkLogicOp:
+ *
+ * "Logical operations are controlled by the logicOpEnable and
+ * logicOp members of VkPipelineColorBlendStateCreateInfo. If
+ * logicOpEnable is VK_TRUE, then a logical operation selected by
+ * logicOp is applied between each color attachment and the
+ * fragment’s corresponding output value, and blending of all
+ * attachments is treated as if it were disabled."
+ *
+ * From the Broadwell PRM Volume 2d: Command Reference: Structures:
+ * BLEND_STATE_ENTRY:
+ *
+ * "Enabling LogicOp and Color Buffer Blending at the same time is
+ * UNDEFINED"
+ */
+ SET(BLEND_STATE, blend.rts[i].LogicOpFunction,
+ genX(vk_to_intel_logic_op)[dyn->cb.logic_op]);
+ SET(BLEND_STATE, blend.rts[i].LogicOpEnable, dyn->cb.logic_op_enable);
+
+ SET(BLEND_STATE, blend.rts[i].ColorClampRange, COLORCLAMP_RTFORMAT);
+ SET(BLEND_STATE, blend.rts[i].PreBlendColorClampEnable, true);
+ SET(BLEND_STATE, blend.rts[i].PostBlendColorClampEnable, true);
+
+ /* Setup blend equation. */
+ SET(BLEND_STATE, blend.rts[i].ColorBlendFunction,
+ genX(vk_to_intel_blend_op)[
+ dyn->cb.attachments[i].color_blend_op]);
+ SET(BLEND_STATE, blend.rts[i].AlphaBlendFunction,
+ genX(vk_to_intel_blend_op)[
+ dyn->cb.attachments[i].alpha_blend_op]);
+
+ if (dyn->cb.attachments[i].src_color_blend_factor !=
+ dyn->cb.attachments[i].src_alpha_blend_factor ||
+ dyn->cb.attachments[i].dst_color_blend_factor !=
+ dyn->cb.attachments[i].dst_alpha_blend_factor ||
+ dyn->cb.attachments[i].color_blend_op !=
+ dyn->cb.attachments[i].alpha_blend_op) {
+ independent_alpha_blend = true;
+ }
+
+ /* The Dual Source Blending documentation says:
+ *
+ * "If SRC1 is included in a src/dst blend factor and
+ * a DualSource RT Write message is not used, results
+ * are UNDEFINED. (This reflects the same restriction in DX APIs,
+ * where undefined results are produced if “o1” is not written
+ * by a PS – there are no default values defined)."
+ *
+ * There is no way to gracefully fix this undefined situation
+ * so we just disable the blending to prevent possible issues.
+ */
+ if (wm_prog_data && !wm_prog_data->dual_src_blend &&
+ anv_is_dual_src_blend_equation(&dyn->cb.attachments[i])) {
+ SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable, false);
+ } else {
+ SET(BLEND_STATE, blend.rts[i].ColorBufferBlendEnable,
+ !dyn->cb.logic_op_enable &&
+ dyn->cb.attachments[i].blend_enable);
+ }
+
+ /* Our hardware applies the blend factor prior to the blend function
+ * regardless of what function is used. Technically, this means the
+ * hardware can do MORE than GL or Vulkan specify. However, it also
+ * means that, for MIN and MAX, we have to stomp the blend factor to
+ * ONE to make it a no-op.
+ */
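+ /* e.g. with VK_BLEND_OP_MAX the API result is max(src, dst); forcing
+ * both factors to ONE makes the HW compute max(1 * src, 1 * dst),
+ * which matches.
+ */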
+ uint32_t SourceBlendFactor;
+ uint32_t DestinationBlendFactor;
+ uint32_t SourceAlphaBlendFactor;
+ uint32_t DestinationAlphaBlendFactor;
+ if (dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MIN ||
+ dyn->cb.attachments[i].color_blend_op == VK_BLEND_OP_MAX) {
+ SourceBlendFactor = BLENDFACTOR_ONE;
+ DestinationBlendFactor = BLENDFACTOR_ONE;
+ } else {
+ SourceBlendFactor = genX(vk_to_intel_blend)[
+ dyn->cb.attachments[i].src_color_blend_factor];
+ DestinationBlendFactor = genX(vk_to_intel_blend)[
+ dyn->cb.attachments[i].dst_color_blend_factor];
+ }
+
+ if (dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MIN ||
+ dyn->cb.attachments[i].alpha_blend_op == VK_BLEND_OP_MAX) {
+ SourceAlphaBlendFactor = BLENDFACTOR_ONE;
+ DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
+ } else {
+ SourceAlphaBlendFactor = genX(vk_to_intel_blend)[
+ dyn->cb.attachments[i].src_alpha_blend_factor];
+ DestinationAlphaBlendFactor = genX(vk_to_intel_blend)[
+ dyn->cb.attachments[i].dst_alpha_blend_factor];
+ }
+
+ /* Replace any Src1 blend factor with ONE if dual source blending is
+ * not enabled.
+ */
+ if (wm_prog_data && !wm_prog_data->dual_src_blend) {
+ if (is_src1_blend_factor(SourceBlendFactor))
+ SourceBlendFactor = BLENDFACTOR_ONE;
+ if (is_src1_blend_factor(DestinationBlendFactor))
+ DestinationBlendFactor = BLENDFACTOR_ONE;
+ }
+
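+ /* Wa_14018912822: a ZERO destination factor is replaced with
+ * CONST_COLOR/CONST_ALPHA and the blend constant is forced to 0.0 in
+ * CC_STATE below, which is mathematically equivalent.
+ */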
+ if (instance->intel_enable_wa_14018912822 &&
+ intel_needs_workaround(cmd_buffer->device->info, 14018912822) &&
+ dyn->ms.rasterization_samples > 1) {
+ if (DestinationBlendFactor == BLENDFACTOR_ZERO) {
+ DestinationBlendFactor = BLENDFACTOR_CONST_COLOR;
+ color_blend_zero = true;
+ }
+ if (DestinationAlphaBlendFactor == BLENDFACTOR_ZERO) {
+ DestinationAlphaBlendFactor = BLENDFACTOR_CONST_ALPHA;
+ alpha_blend_zero = true;
+ }
+ }
+
+ SET(BLEND_STATE, blend.rts[i].SourceBlendFactor, SourceBlendFactor);
+ SET(BLEND_STATE, blend.rts[i].DestinationBlendFactor, DestinationBlendFactor);
+ SET(BLEND_STATE, blend.rts[i].SourceAlphaBlendFactor, SourceAlphaBlendFactor);
+ SET(BLEND_STATE, blend.rts[i].DestinationAlphaBlendFactor, DestinationAlphaBlendFactor);
+ }
+ gfx->color_blend_zero = color_blend_zero;
+ gfx->alpha_blend_zero = alpha_blend_zero;
+
+ SET(BLEND_STATE, blend.IndependentAlphaBlendEnable, independent_alpha_blend);
+
+ /* 3DSTATE_PS_BLEND to be consistent with the rest of the
+ * BLEND_STATE_ENTRY.
+ */
+ SET(PS_BLEND, ps_blend.HasWriteableRT, has_writeable_rt);
+ SET(PS_BLEND, ps_blend.ColorBufferBlendEnable, GET(blend.rts[0].ColorBufferBlendEnable));
+ SET(PS_BLEND, ps_blend.SourceAlphaBlendFactor, GET(blend.rts[0].SourceAlphaBlendFactor));
+ SET(PS_BLEND, ps_blend.DestinationAlphaBlendFactor, gfx->alpha_blend_zero ?
+ BLENDFACTOR_CONST_ALPHA :
+ GET(blend.rts[0].DestinationAlphaBlendFactor));
+ SET(PS_BLEND, ps_blend.SourceBlendFactor, GET(blend.rts[0].SourceBlendFactor));
+ SET(PS_BLEND, ps_blend.DestinationBlendFactor, gfx->color_blend_zero ?
+ BLENDFACTOR_CONST_COLOR :
+ GET(blend.rts[0].DestinationBlendFactor));
+ SET(PS_BLEND, ps_blend.AlphaTestEnable, false);
+ SET(PS_BLEND, ps_blend.IndependentAlphaBlendEnable, GET(blend.IndependentAlphaBlendEnable));
+ SET(PS_BLEND, ps_blend.AlphaToCoverageEnable, dyn->ms.alpha_to_coverage_enable);
+ }
+
+ if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
+ SET(CC_STATE, cc.BlendConstantColorRed,
+ gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[0]);
+ SET(CC_STATE, cc.BlendConstantColorGreen,
+ gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[1]);
+ SET(CC_STATE, cc.BlendConstantColorBlue,
+ gfx->color_blend_zero ? 0.0f : dyn->cb.blend_constants[2]);
+ SET(CC_STATE, cc.BlendConstantColorAlpha,
+ gfx->alpha_blend_zero ? 0.0f : dyn->cb.blend_constants[3]);
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE)) {
+ struct anv_instance *instance = cmd_buffer->device->physical->instance;
+ const VkViewport *viewports = dyn->vp.viewports;
+
+ const float scale = dyn->vp.depth_clip_negative_one_to_one ? 0.5f : 1.0f;
+
+ for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
+ const VkViewport *vp = &viewports[i];
+
+ /* The gfx7 state struct has just the matrix and guardband fields, the
+ * gfx8 struct adds the min/max viewport fields. */
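+ /* m00/m11/m22 are the viewport scale terms and m30/m31/m32 the
+ * translation, i.e. the usual NDC-to-window transform (depth scaled
+ * by 0.5 in [-1,1] depth mode).
+ */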
+ struct GENX(SF_CLIP_VIEWPORT) sfv = {
+ .ViewportMatrixElementm00 = vp->width / 2,
+ .ViewportMatrixElementm11 = vp->height / 2,
+ .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
+ .ViewportMatrixElementm30 = vp->x + vp->width / 2,
+ .ViewportMatrixElementm31 = vp->y + vp->height / 2,
+ .ViewportMatrixElementm32 = dyn->vp.depth_clip_negative_one_to_one ?
+ (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
+ .XMinClipGuardband = -1.0f,
+ .XMaxClipGuardband = 1.0f,
+ .YMinClipGuardband = -1.0f,
+ .YMaxClipGuardband = 1.0f,
+ .XMinViewPort = vp->x,
+ .XMaxViewPort = vp->x + vp->width - 1,
+ .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
+ .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
+ };
+
+ /* Fix depth test misrenderings by lowering translated depth range */
+ if (instance->lower_depth_range_rate != 1.0f)
+ sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
+
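+ /* Start from the maximum 16K x 16K framebuffer extent and narrow it
+ * with the render area and scissor; the resulting rectangle feeds the
+ * guardband calculation below.
+ */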
+ const uint32_t fb_size_max = 1 << 14;
+ uint32_t x_min = 0, x_max = fb_size_max;
+ uint32_t y_min = 0, y_max = fb_size_max;
+
+ /* If we have a valid renderArea, include that */
+ if (gfx->render_area.extent.width > 0 &&
+ gfx->render_area.extent.height > 0) {
+ x_min = MAX2(x_min, gfx->render_area.offset.x);
+ x_max = MIN2(x_max, gfx->render_area.offset.x +
+ gfx->render_area.extent.width);
+ y_min = MAX2(y_min, gfx->render_area.offset.y);
+ y_max = MIN2(y_max, gfx->render_area.offset.y +
+ gfx->render_area.extent.height);
+ }
+
+ /* The client is required to have enough scissors for whatever it
+ * sets as ViewportIndex but it's possible that they've got more
+ * viewports set from a previous command. Also, from the Vulkan
+ * 1.3.207 spec:
+ *
+ * "The application must ensure (using scissor if necessary) that
+ * all rendering is contained within the render area."
+ *
+ * If the client doesn't set a scissor, that basically means it
+ * guarantees everything is in-bounds already. If we end up using a
+ * guardband of [-1, 1] in that case, there shouldn't be much loss.
+ * It's theoretically possible that they could do all their clipping
+ * with clip planes but that'd be a bit odd.
+ */
+ if (i < dyn->vp.scissor_count) {
+ const VkRect2D *scissor = &dyn->vp.scissors[i];
+ x_min = MAX2(x_min, scissor->offset.x);
+ x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
+ y_min = MAX2(y_min, scissor->offset.y);
+ y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
+ }
+
+ /* Only bother calculating the guardband if our known render area is
+ * less than the maximum size. Otherwise, it will calculate [-1, 1]
+ * anyway but possibly with precision loss.
+ */
+ if (x_min > 0 || x_max < fb_size_max ||
+ y_min > 0 || y_max < fb_size_max) {
+ intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
+ sfv.ViewportMatrixElementm00,
+ sfv.ViewportMatrixElementm11,
+ sfv.ViewportMatrixElementm30,
+ sfv.ViewportMatrixElementm31,
+ &sfv.XMinClipGuardband,
+ &sfv.XMaxClipGuardband,
+ &sfv.YMinClipGuardband,
+ &sfv.YMaxClipGuardband);
+ }
+
+#define SET_VP(bit, state, field) \
+ do { \
+ if (hw_state->state.field != sfv.field) { \
+ hw_state->state.field = sfv.field; \
+ BITSET_SET(hw_state->dirty, \
+ ANV_GFX_STATE_##bit); \
+ } \
+ } while (0)
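+ /* Like SET(), but pulls the new value from the locally computed sfv
+ * by field name instead of taking it as a parameter.
+ */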
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm00);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm11);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm22);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm30);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm31);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], ViewportMatrixElementm32);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinClipGuardband);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxClipGuardband);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinClipGuardband);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxClipGuardband);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMinViewPort);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], XMaxViewPort);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMinViewPort);
+ SET_VP(VIEWPORT_SF_CLIP, vp_sf_clip.elem[i], YMaxViewPort);
+#undef SET_VP
+
+ const bool depth_range_unrestricted =
+ cmd_buffer->device->vk.enabled_extensions.EXT_depth_range_unrestricted;
+
+ float min_depth_limit = depth_range_unrestricted ? -FLT_MAX : 0.0;
+ float max_depth_limit = depth_range_unrestricted ? FLT_MAX : 1.0;
+
+ float min_depth = dyn->rs.depth_clamp_enable ?
+ MIN2(vp->minDepth, vp->maxDepth) : min_depth_limit;
+ float max_depth = dyn->rs.depth_clamp_enable ?
+ MAX2(vp->minDepth, vp->maxDepth) : max_depth_limit;
+
+ SET(VIEWPORT_CC, vp_cc.elem[i].MinimumDepth, min_depth);
+ SET(VIEWPORT_CC, vp_cc.elem[i].MaximumDepth, max_depth);
+
+ SET(CLIP, clip.MaximumVPIndex, dyn->vp.viewport_count > 0 ?
+ dyn->vp.viewport_count - 1 : 0);
+ }
+
+ /* If the HW state is already considered dirty or the previously
+ * programmed viewport count is smaller than what we need, update the
+ * viewport count and ensure the HW state is dirty. Otherwise, if the
+ * number of viewports programmed previously was larger than what we
+ * need now, there is no need to reemit; we can keep the old programmed
+ * values.
+ */
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP) ||
+ hw_state->vp_sf_clip.count < dyn->vp.viewport_count) {
+ hw_state->vp_sf_clip.count = dyn->vp.viewport_count;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP);
+ }
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
+ hw_state->vp_cc.count < dyn->vp.viewport_count) {
+ hw_state->vp_cc.count = dyn->vp.viewport_count;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
+ }
+ }
+
+ if ((gfx->dirty & ANV_CMD_DIRTY_RENDER_AREA) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
+ const VkRect2D *scissors = dyn->vp.scissors;
+ const VkViewport *viewports = dyn->vp.viewports;
+
+ for (uint32_t i = 0; i < dyn->vp.scissor_count; i++) {
+ const VkRect2D *s = &scissors[i];
+ const VkViewport *vp = &viewports[i];
+
+ const int max = 0xffff;
+
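+ /* Intersect the scissor rectangle with the viewport; the MIN2/MAX2 on
+ * Y handles negative-height (flipped) viewports.
+ */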
+ uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
+ uint32_t x_min = MAX2(s->offset.x, vp->x);
+ int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
+ MAX2(vp->y, vp->y + vp->height) - 1);
+ int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
+ vp->x + vp->width - 1);
+
+ y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
+ x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
+
+ /* Do this math using int64_t so overflow gets clamped correctly. */
+ if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+ y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
+ x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
+ y_max = CLAMP((uint64_t) y_max, 0,
+ gfx->render_area.offset.y +
+ gfx->render_area.extent.height - 1);
+ x_max = CLAMP((uint64_t) x_max, 0,
+ gfx->render_area.offset.x +
+ gfx->render_area.extent.width - 1);
+ }
+
+ if (s->extent.width <= 0 || s->extent.height <= 0) {
+ /* Since xmax and ymax are inclusive, we have to have xmax < xmin
+ * or ymax < ymin for empty clips. In case clip x, y, width, height
+ * are all 0, the clamps above produce 0 for xmin, ymin, xmax,
+ * ymax, which isn't what we want. Just special-case empty clips
+ * and produce a canonical empty clip.
+ */
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, 1);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, 1);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, 0);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, 0);
+ } else {
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleYMin, y_min);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleXMin, x_min);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleYMax, y_max);
+ SET(SCISSOR, scissor.elem[i].ScissorRectangleXMax, x_max);
+ }
+ }
+
+ /* If the HW state is already considered dirty or the previously
+ * programmed scissor count is smaller than what we need, update the
+ * scissor count and ensure the HW state is dirty. Otherwise, if the
+ * number of scissors programmed previously was larger than what we
+ * need now, there is no need to reemit; we can keep the old programmed
+ * values.
+ */
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR) ||
+ hw_state->scissor.count < dyn->vp.scissor_count) {
+ hw_state->scissor.count = dyn->vp.scissor_count;
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SCISSOR);
+ }
+ }
+
+#if GFX_VERx10 == 125
+ if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)) {
+ unsigned fb_width, fb_height, tile_width, tile_height;
+
+ if (cmd_buffer->device->physical->instance->enable_tbimr &&
+ calculate_render_area(cmd_buffer, &fb_width, &fb_height) &&
+ calculate_tile_dimensions(cmd_buffer, fb_width, fb_height,
+ &tile_width, &tile_height)) {
+ /* Use a batch size of 128 polygons per slice as recommended
+ * by BSpec 68436 "TBIMR Programming".
+ */
+ const unsigned num_slices = cmd_buffer->device->info->num_slices;
+ const unsigned batch_size = DIV_ROUND_UP(num_slices, 2) * 256;
+
+ SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleHeight, tile_height);
+ SET(TBIMR_TILE_PASS_INFO, tbimr.TileRectangleWidth, tile_width);
+ SET(TBIMR_TILE_PASS_INFO, tbimr.VerticalTileCount,
+ DIV_ROUND_UP(fb_height, tile_height));
+ SET(TBIMR_TILE_PASS_INFO, tbimr.HorizontalTileCount,
+ DIV_ROUND_UP(fb_width, tile_width));
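+ /* The batch size field is encoded as log2(size) - 5, i.e. 32 << n
+ * polygons.
+ */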
+ SET(TBIMR_TILE_PASS_INFO, tbimr.TBIMRBatchSize,
+ util_logbase2(batch_size) - 5);
+ SET(TBIMR_TILE_PASS_INFO, tbimr.TileBoxCheck, true);
+ SET(TBIMR_TILE_PASS_INFO, use_tbimr, true);
+ } else {
+ hw_state->use_tbimr = false;
+ }
+ }
+#endif
+
+ struct anv_push_constants *push = &cmd_buffer->state.gfx.base.push_constants;
+
+ /* If the pipeline uses a dynamic value for patch_control_points and
+ * either the pipeline or the dynamic value changed, check the value
+ * and reemit if needed.
+ */
+ if (pipeline->dynamic_patch_control_points &&
+ ((gfx->dirty & ANV_CMD_DIRTY_PIPELINE) ||
+ BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS)) &&
+ push->gfx.tcs_input_vertices != dyn->ts.patch_control_points) {
+ push->gfx.tcs_input_vertices = dyn->ts.patch_control_points;
+ cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
+ gfx->base.push_constants_data_dirty = true;
+ }
+
+#undef GET
+#undef SET
+#undef SET_STAGE
+
+ vk_dynamic_graphics_state_clear_dirty(&cmd_buffer->vk.dynamic_graphics_state);
+}
+
+static void
+emit_wa_18020335297_dummy_draw(struct anv_cmd_buffer *cmd_buffer)
+{
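+ /* Emit a throw-away draw per slice with clipping set to reject all
+ * geometry, so nothing reaches the rasterizer; used by the
+ * Wa_18020335297 handling when the CC viewport pointer is reprogrammed
+ * (see cmd_buffer_flush_gfx_hw_state).
+ */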
+#if GFX_VERx10 >= 125
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VFG), vfg) {
+ vfg.DistributionMode = RR_STRICT;
+ }
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
+ vf.GeometryDistributionEnable = true;
+ }
+#endif
+
+#if GFX_VER >= 12
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
+ pr.ReplicaMask = 1;
+ }
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_RASTER), rr) {
+ rr.CullMode = CULLMODE_NONE;
+ rr.FrontFaceFillMode = FILL_MODE_SOLID;
+ rr.BackFaceFillMode = FILL_MODE_SOLID;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), zero);
+
+#if GFX_VER >= 11
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS_2), zero);
+#endif
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CLIP), clip) {
+ clip.ClipEnable = true;
+ clip.ClipMode = CLIPMODE_REJECT_ALL;
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), zero);
+
+ uint32_t *vertex_elements = anv_batch_emitn(&cmd_buffer->batch, 1 + 2 * 2,
+ GENX(3DSTATE_VERTEX_ELEMENTS));
+ uint32_t *ve_pack_dest = &vertex_elements[1];
+
+ for (int i = 0; i < 2; i++) {
+ struct GENX(VERTEX_ELEMENT_STATE) element = {
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+ .Component0Control = VFCOMP_STORE_0,
+ .Component1Control = VFCOMP_STORE_0,
+ .Component2Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
+ .Component3Control = i == 0 ? VFCOMP_STORE_0 : VFCOMP_STORE_1_FP,
+ };
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL, ve_pack_dest, &element);
+ ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+ topo.PrimitiveTopologyType = _3DPRIM_TRILIST;
+ }
+
+ /* Emit dummy draw per slice. */
+ for (unsigned i = 0; i < cmd_buffer->device->info->num_slices; i++) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ prim.VertexCountPerInstance = 3;
+ prim.PrimitiveTopologyType = _3DPRIM_TRILIST;
+ prim.InstanceCount = 1;
+ prim.VertexAccessType = SEQUENTIAL;
+ }
+ }
+}
+/**
+ * This function handles dirty state emission to the batch buffer.
+ */
+static void
+cmd_buffer_gfx_state_emission(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(gfx->base.pipeline);
+ const struct vk_dynamic_graphics_state *dyn =
+ &cmd_buffer->vk.dynamic_graphics_state;
+ struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_URB)) {
+ genX(urb_workaround)(cmd_buffer, &pipeline->urb_cfg);
+
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.urb);
+
+ memcpy(&gfx->urb_cfg, &pipeline->urb_cfg,
+ sizeof(struct intel_urb_config));
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.primitive_replication);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_INSTANCING))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_instancing);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs);
+
+#if GFX_VER >= 11
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_sgvs_2);
+#endif
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vs);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_HS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.vf_statistics);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_SWIZ))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_swiz);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
+ /* Wa_16011773973:
+ * If SOL is enabled and SO_DECL state has to be programmed,
+ * 1. Send 3D State SOL state with SOL disabled
+ * 2. Send SO_DECL NP state
+ * 3. Send 3D State SOL with SOL Enabled
+ */
+ if (intel_needs_workaround(device->info, 16011773973) &&
+ pipeline->uses_xfb)
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so);
+
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
+ final.so_decl_list);
+
+#if GFX_VER >= 11 && GFX_VER < 20
+ /* ICL PRMs, Volume 2a - Command Reference: Instructions,
+ * 3DSTATE_SO_DECL_LIST:
+ *
+ * "Workaround: This command must be followed by a PIPE_CONTROL with
+ * CS Stall bit set."
+ *
+ * On DG2+ also known as Wa_1509820217.
+ */
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT);
+#endif
+ }
+
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_control);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_shader);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.mesh_distrib);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_control);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_shader);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.task_redistrib);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.sbe_mesh);
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH))
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.clip_mesh);
+ } else {
+ assert(!BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_SHADER) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MESH_DISTRIB) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_SHADER) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TASK_REDISTRIB) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP_MESH) &&
+ !BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SBE_MESH));
+ }
+
+#define INIT(category, name) \
+ .name = hw_state->category.name
+#define SET(s, category, name) \
+ s.name = hw_state->category.name
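+ /* INIT() is used inside designated initializers and SET() inside
+ * anv_batch_emit*() blocks; both copy the value tracked in hw_state
+ * into the HW packet field of the same name.
+ */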
+
+ /* Now the potentially dynamic instructions */
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS),
+ pipeline, partial.ps, ps) {
+ SET(ps, ps, KernelStartPointer0);
+ SET(ps, ps, KernelStartPointer1);
+ SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData0);
+ SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData1);
+
+#if GFX_VER < 20
+ SET(ps, ps, KernelStartPointer2);
+ SET(ps, ps, DispatchGRFStartRegisterForConstantSetupData2);
+
+ SET(ps, ps, _8PixelDispatchEnable);
+ SET(ps, ps, _16PixelDispatchEnable);
+ SET(ps, ps, _32PixelDispatchEnable);
+#else
+ SET(ps, ps, Kernel0Enable);
+ SET(ps, ps, Kernel1Enable);
+ SET(ps, ps, Kernel0SIMDWidth);
+ SET(ps, ps, Kernel1SIMDWidth);
+ SET(ps, ps, Kernel0PolyPackingPolicy);
+#endif
+ SET(ps, ps, PositionXYOffsetSelect);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_PS_EXTRA),
+ pipeline, partial.ps_extra, pse) {
+ SET(pse, ps_extra, PixelShaderIsPerSample);
+#if GFX_VER >= 11
+ SET(pse, ps_extra, PixelShaderIsPerCoarsePixel);
+#endif
+#if GFX_VERx10 >= 125
+ SET(pse, ps_extra, EnablePSDependencyOnCPsizeChange);
+#endif
+ SET(pse, ps_extra, PixelShaderKillsPixel);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CLIP)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_CLIP),
+ pipeline, partial.clip, clip) {
+ SET(clip, clip, APIMode);
+ SET(clip, clip, ViewportXYClipTestEnable);
+ SET(clip, clip, TriangleStripListProvokingVertexSelect);
+ SET(clip, clip, LineStripListProvokingVertexSelect);
+ SET(clip, clip, TriangleFanProvokingVertexSelect);
+ SET(clip, clip, MaximumVPIndex);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_STREAMOUT)) {
+ genX(streamout_prologue)(cmd_buffer);
+
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT),
+ pipeline, partial.so, so) {
+ SET(so, so, RenderingDisable);
+ SET(so, so, RenderStreamSelect);
+ SET(so, so, ReorderMode);
+ SET(so, so, ForceRendering);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_SF_CLIP)) {
+ struct anv_state sf_clip_state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ hw_state->vp_sf_clip.count * 64, 64);
+
+ for (uint32_t i = 0; i < hw_state->vp_sf_clip.count; i++) {
+ struct GENX(SF_CLIP_VIEWPORT) sfv = {
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm00),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm11),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm22),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm30),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm31),
+ INIT(vp_sf_clip.elem[i], ViewportMatrixElementm32),
+ INIT(vp_sf_clip.elem[i], XMinClipGuardband),
+ INIT(vp_sf_clip.elem[i], XMaxClipGuardband),
+ INIT(vp_sf_clip.elem[i], YMinClipGuardband),
+ INIT(vp_sf_clip.elem[i], YMaxClipGuardband),
+ INIT(vp_sf_clip.elem[i], XMinViewPort),
+ INIT(vp_sf_clip.elem[i], XMaxViewPort),
+ INIT(vp_sf_clip.elem[i], YMinViewPort),
+ INIT(vp_sf_clip.elem[i], YMaxViewPort),
+ };
+ GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
+ }
+
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
+ clip.SFClipViewportPointer = sf_clip_state.offset;
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC)) {
+ hw_state->vp_cc.state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ hw_state->vp_cc.count * 8, 32);
+
+ for (uint32_t i = 0; i < hw_state->vp_cc.count; i++) {
+ struct GENX(CC_VIEWPORT) cc_viewport = {
+ INIT(vp_cc.elem[i], MinimumDepth),
+ INIT(vp_cc.elem[i], MaximumDepth),
+ };
+ GENX(CC_VIEWPORT_pack)(NULL, hw_state->vp_cc.state.map + i * 8,
+ &cc_viewport);
+ }
+
+ /* Dirty the pointers to reemit 3DSTATE_VIEWPORT_STATE_POINTERS_CC below
+ */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR);
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) {
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
+ cc.CCViewportPointer = hw_state->vp_cc.state.offset;
+ }
+ cmd_buffer->state.gfx.viewport_set = true;
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SCISSOR)) {
+ /* Wa_1409725701:
+ *
+ * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
+ * stored as an array of up to 16 elements. The location of first
+ * element of the array, as specified by Pointer to SCISSOR_RECT,
+ * should be aligned to a 64-byte boundary."
+ */
+ struct anv_state scissor_state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ hw_state->scissor.count * 8, 64);
+
+ for (uint32_t i = 0; i < hw_state->scissor.count; i++) {
+ struct GENX(SCISSOR_RECT) scissor = {
+ INIT(scissor.elem[i], ScissorRectangleYMin),
+ INIT(scissor.elem[i], ScissorRectangleXMin),
+ INIT(scissor.elem[i], ScissorRectangleYMax),
+ INIT(scissor.elem[i], ScissorRectangleXMax),
+ };
+ GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
+ }
+
+ anv_batch_emit(&cmd_buffer->batch,
+ GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
+ ssp.ScissorRectPointer = scissor_state.offset;
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
+ SET(vft, vft, PrimitiveTopologyType);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT)) {
+ const uint32_t ve_count =
+ pipeline->vs_input_elements + pipeline->svgs_count;
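+ /* One header DWord plus two DWords per VERTEX_ELEMENT_STATE; always
+ * allocate room for at least one element.
+ */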
+ const uint32_t num_dwords = 1 + 2 * MAX2(1, ve_count);
+ uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
+ GENX(3DSTATE_VERTEX_ELEMENTS));
+
+ if (p) {
+ if (ve_count == 0) {
+ memcpy(p + 1, cmd_buffer->device->empty_vs_input,
+ sizeof(cmd_buffer->device->empty_vs_input));
+ } else if (ve_count == pipeline->vertex_input_elems) {
+ /* MESA_VK_DYNAMIC_VI is not dynamic for this pipeline, so
+ * everything is in pipeline->vertex_input_data and we can just
+ * memcpy
+ */
+ memcpy(p + 1, pipeline->vertex_input_data, 4 * 2 * ve_count);
+ anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline,
+ final.vf_instancing);
+ } else {
+ assert(pipeline->final.vf_instancing.len == 0);
+ /* Use dyn->vi to emit the dynamic VERTEX_ELEMENT_STATE input. */
+ genX(emit_vertex_input)(&cmd_buffer->batch, p + 1,
+ pipeline, dyn->vi, false /* emit_in_pipeline */);
+ /* Then append the VERTEX_ELEMENT_STATE for the draw parameters */
+ memcpy(p + 1 + 2 * pipeline->vs_input_elements,
+ pipeline->vertex_input_data,
+ 4 * 2 * pipeline->vertex_input_elems);
+ }
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TE)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_TE),
+ pipeline, partial.te, te) {
+ SET(te, te, OutputTopology);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_GS)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_GS),
+ pipeline, partial.gs, gs) {
+ SET(gs, gs, ReorderMode);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CPS)) {
+#if GFX_VER == 11
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS), cps) {
+ SET(cps, cps, CoarsePixelShadingMode);
+ SET(cps, cps, MinCPSizeX);
+ SET(cps, cps, MinCPSizeY);
+ }
+#elif GFX_VER >= 12
+ /* TODO: we can optimize this flush in the following cases:
+ *
+ * In the case where the last geometry shader emits a value that is
+ * not constant, we can avoid this stall because we can synchronize
+ * the pixel shader internally with
+ * 3DSTATE_PS::EnablePSDependencyOnCPsizeChange.
+ *
+ * If we know that the previous pipeline and the current one are
+ * using the same fragment shading rate.
+ */
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+#if GFX_VERx10 >= 125
+ pc.PSSStallSyncEnable = true;
+#else
+ pc.PSDSyncEnable = true;
+#endif
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CPS_POINTERS), cps) {
+ SET(cps, cps, CoarsePixelShadingStateArrayPointer);
+ }
+#endif
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SF)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_SF),
+ pipeline, partial.sf, sf) {
+ SET(sf, sf, LineWidth);
+ SET(sf, sf, TriangleStripListProvokingVertexSelect);
+ SET(sf, sf, LineStripListProvokingVertexSelect);
+ SET(sf, sf, TriangleFanProvokingVertexSelect);
+ SET(sf, sf, LegacyGlobalDepthBiasEnable);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_RASTER)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_RASTER),
+ pipeline, partial.raster, raster) {
+ SET(raster, raster, APIMode);
+ SET(raster, raster, DXMultisampleRasterizationEnable);
+ SET(raster, raster, AntialiasingEnable);
+ SET(raster, raster, CullMode);
+ SET(raster, raster, FrontWinding);
+ SET(raster, raster, GlobalDepthOffsetEnableSolid);
+ SET(raster, raster, GlobalDepthOffsetEnableWireframe);
+ SET(raster, raster, GlobalDepthOffsetEnablePoint);
+ SET(raster, raster, GlobalDepthOffsetConstant);
+ SET(raster, raster, GlobalDepthOffsetScale);
+ SET(raster, raster, GlobalDepthOffsetClamp);
+ SET(raster, raster, FrontFaceFillMode);
+ SET(raster, raster, BackFaceFillMode);
+ SET(raster, raster, ViewportZFarClipTestEnable);
+ SET(raster, raster, ViewportZNearClipTestEnable);
+ SET(raster, raster, ConservativeRasterizationEnable);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_MULTISAMPLE),
+ pipeline, partial.ms, ms) {
+ SET(ms, ms, NumberofMultisamples);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE)) {
+ hw_state->cc.state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ GENX(COLOR_CALC_STATE_length) * 4,
+ 64);
+ struct GENX(COLOR_CALC_STATE) cc = {
+ INIT(cc, BlendConstantColorRed),
+ INIT(cc, BlendConstantColorGreen),
+ INIT(cc, BlendConstantColorBlue),
+ INIT(cc, BlendConstantColorAlpha),
+ };
+ GENX(COLOR_CALC_STATE_pack)(NULL, hw_state->cc.state.map, &cc);
+
+ /* Dirty the pointers to reemit 3DSTATE_CC_STATE_POINTERS below
+ */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR);
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_CC_STATE_PTR)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
+ ccp.ColorCalcStatePointer = hw_state->cc.state.offset;
+ ccp.ColorCalcStatePointerValid = true;
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
+ SET(sm, sm, SampleMask);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
+ SET(ds, ds, DoubleSidedStencilEnable);
+ SET(ds, ds, StencilTestMask);
+ SET(ds, ds, StencilWriteMask);
+ SET(ds, ds, BackfaceStencilTestMask);
+ SET(ds, ds, BackfaceStencilWriteMask);
+ SET(ds, ds, StencilReferenceValue);
+ SET(ds, ds, BackfaceStencilReferenceValue);
+ SET(ds, ds, DepthTestEnable);
+ SET(ds, ds, DepthBufferWriteEnable);
+ SET(ds, ds, DepthTestFunction);
+ SET(ds, ds, StencilTestEnable);
+ SET(ds, ds, StencilBufferWriteEnable);
+ SET(ds, ds, StencilFailOp);
+ SET(ds, ds, StencilPassDepthPassOp);
+ SET(ds, ds, StencilPassDepthFailOp);
+ SET(ds, ds, StencilTestFunction);
+ SET(ds, ds, BackfaceStencilFailOp);
+ SET(ds, ds, BackfaceStencilPassDepthPassOp);
+ SET(ds, ds, BackfaceStencilPassDepthFailOp);
+ SET(ds, ds, BackfaceStencilTestFunction);
+ }
+ }
+
+#if GFX_VER >= 12
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
+ SET(db, db, DepthBoundsTestEnable);
+ SET(db, db, DepthBoundsTestMinValue);
+ SET(db, db, DepthBoundsTestMaxValue);
+ }
+ }
+#endif
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_LINE_STIPPLE)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
+ SET(ls, ls, LineStipplePattern);
+ SET(ls, ls, LineStippleInverseRepeatCount);
+ SET(ls, ls, LineStippleRepeatCount);
+ }
+#if GFX_VER >= 11
+ /* ICL PRMs, Volume 2a - Command Reference: Instructions,
+ * 3DSTATE_LINE_STIPPLE:
+ *
+ * "Workaround: This command must be followed by a PIPE_CONTROL with
+ * CS Stall bit set."
+ */
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT);
+#endif
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VF)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
+#if GFX_VERx10 >= 125
+ vf.GeometryDistributionEnable = true;
+#endif
+ SET(vf, vf, IndexedDrawCutIndexEnable);
+ SET(vf, vf, CutIndex);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_INDEX_BUFFER)) {
+ struct anv_buffer *buffer = gfx->index_buffer;
+ uint32_t offset = gfx->index_offset;
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
+ ib.IndexFormat = gfx->index_type;
+ ib.MOCS = anv_mocs(cmd_buffer->device,
+ buffer ? buffer->address.bo : NULL,
+ ISL_SURF_USAGE_INDEX_BUFFER_BIT);
+#if GFX_VER >= 12
+ ib.L3BypassDisable = true;
+#endif
+ if (buffer) {
+ ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
+ ib.BufferSize = gfx->index_size;
+ }
+ }
+ }
+
+#if GFX_VERx10 >= 125
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VFG)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_VFG),
+ pipeline, partial.vfg, vfg) {
+ SET(vfg, vfg, ListCutIndexEnable);
+ }
+ }
+#endif
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SAMPLE_PATTERN)) {
+ genX(emit_sample_pattern)(&cmd_buffer->batch,
+ dyn->ms.sample_locations_enable ?
+ dyn->ms.sample_locations : NULL);
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM)) {
+ anv_batch_emit_merge(&cmd_buffer->batch, GENX(3DSTATE_WM),
+ pipeline, partial.wm, wm) {
+ SET(wm, wm, ForceThreadDispatchEnable);
+ SET(wm, wm, LineStippleEnable);
+ SET(wm, wm, BarycentricInterpolationMode);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PS_BLEND)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS_BLEND), blend) {
+ SET(blend, ps_blend, HasWriteableRT);
+ SET(blend, ps_blend, ColorBufferBlendEnable);
+ SET(blend, ps_blend, SourceAlphaBlendFactor);
+ SET(blend, ps_blend, DestinationAlphaBlendFactor);
+ SET(blend, ps_blend, SourceBlendFactor);
+ SET(blend, ps_blend, DestinationBlendFactor);
+ SET(blend, ps_blend, AlphaTestEnable);
+ SET(blend, ps_blend, IndependentAlphaBlendEnable);
+ SET(blend, ps_blend, AlphaToCoverageEnable);
+ }
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE)) {
+ const uint32_t num_dwords = GENX(BLEND_STATE_length) +
+ GENX(BLEND_STATE_ENTRY_length) * MAX_RTS;
+ hw_state->blend.state =
+ anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
+ num_dwords * 4,
+ 64);
+
+ uint32_t *dws = hw_state->blend.state.map;
+
+ struct GENX(BLEND_STATE) blend_state = {
+ INIT(blend, AlphaToCoverageEnable),
+ INIT(blend, AlphaToOneEnable),
+ INIT(blend, IndependentAlphaBlendEnable),
+ };
+ GENX(BLEND_STATE_pack)(NULL, dws, &blend_state);
+
+ /* Jump to blend entries. */
+ dws += GENX(BLEND_STATE_length);
+ for (uint32_t i = 0; i < MAX_RTS; i++) {
+ struct GENX(BLEND_STATE_ENTRY) entry = {
+ INIT(blend.rts[i], WriteDisableAlpha),
+ INIT(blend.rts[i], WriteDisableRed),
+ INIT(blend.rts[i], WriteDisableGreen),
+ INIT(blend.rts[i], WriteDisableBlue),
+ INIT(blend.rts[i], LogicOpFunction),
+ INIT(blend.rts[i], LogicOpEnable),
+ INIT(blend.rts[i], ColorBufferBlendEnable),
+ INIT(blend.rts[i], ColorClampRange),
+ INIT(blend.rts[i], PreBlendColorClampEnable),
+ INIT(blend.rts[i], PostBlendColorClampEnable),
+ INIT(blend.rts[i], SourceBlendFactor),
+ INIT(blend.rts[i], DestinationBlendFactor),
+ INIT(blend.rts[i], ColorBlendFunction),
+ INIT(blend.rts[i], SourceAlphaBlendFactor),
+ INIT(blend.rts[i], DestinationAlphaBlendFactor),
+ INIT(blend.rts[i], AlphaBlendFunction),
+ };
+
+ GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
+ dws += GENX(BLEND_STATE_ENTRY_length);
+ }
+
+ /* Dirty the pointers to reemit 3DSTATE_BLEND_STATE_POINTERS below */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR);
+ }
+
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_BLEND_STATE_PTR)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
+ bsp.BlendStatePointer = hw_state->blend.state.offset;
+ bsp.BlendStatePointerValid = true;
+ }
+ }
+
+#if GFX_VERx10 >= 125
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WA_18019816803)) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_PSS_STALL_SYNC_BIT);
+ }
+#endif
+
+#if GFX_VER == 9
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_PMA_FIX))
+ genX(cmd_buffer_enable_pma_fix)(cmd_buffer, hw_state->pma_fix);
+#endif
+
+#if GFX_VERx10 >= 125
+ if (hw_state->use_tbimr &&
+ BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_TBIMR_TILE_PASS_INFO)) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TBIMR_TILE_PASS_INFO),
+ tbimr) {
+ SET(tbimr, tbimr, TileRectangleHeight);
+ SET(tbimr, tbimr, TileRectangleWidth);
+ SET(tbimr, tbimr, VerticalTileCount);
+ SET(tbimr, tbimr, HorizontalTileCount);
+ SET(tbimr, tbimr, TBIMRBatchSize);
+ SET(tbimr, tbimr, TileBoxCheck);
+ }
+ }
+#endif
+
+#undef INIT
+#undef SET
+
+ BITSET_ZERO(hw_state->dirty);
+}
+
+/**
+ * This function handles possible state workarounds and emits the dirty
+ * instructions to the batch buffer.
+ */
+void
+genX(cmd_buffer_flush_gfx_hw_state)(struct anv_cmd_buffer *cmd_buffer)
+{
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
+ struct anv_graphics_pipeline *pipeline =
+ anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
+ struct anv_gfx_dynamic_state *hw_state = &gfx->dyn_state;
+
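+ /* With INTEL_DEBUG=reemit, OR in the device's gfx_dirty_state mask so
+ * the HW state is reprogrammed on every flush (debug aid).
+ */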
+ if (INTEL_DEBUG(DEBUG_REEMIT)) {
+ BITSET_OR(gfx->dyn_state.dirty, gfx->dyn_state.dirty,
+ device->gfx_dirty_state);
+ }
+
+ /**
+ * Put potential workarounds here if you need to reemit an instruction
+ * because another one changed.
+ */
+
+ /* Since Wa_16011773973 will disable 3DSTATE_STREAMOUT, we need to reemit
+ * it after.
+ */
+ if (intel_needs_workaround(device->info, 16011773973) &&
+ pipeline->uses_xfb &&
+ BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST)) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ }
+
+ /* Gfx11 undocumented issue:
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9781
+ */
+#if GFX_VER == 11
+ if (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_WM))
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
+#endif
+
+ /* Wa_18020335297 - Apply the WA when viewport ptr is reprogrammed. */
+ if (intel_needs_workaround(device->info, 18020335297) &&
+ (BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC) ||
+ BITSET_TEST(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC_PTR)) &&
+ cmd_buffer->state.gfx.viewport_set) {
+ /* For mesh, we implement the WA using CS stall. This is for
+ * simplicity and takes care of possible interaction with Wa_16014390852.
+ */
+ if (anv_pipeline_is_mesh(pipeline)) {
+ genx_batch_emit_pipe_control(&cmd_buffer->batch, device->info,
+ _3D, ANV_PIPE_CS_STALL_BIT);
+ } else {
+ /* Mask off all instructions that we program. */
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VFG);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_RASTER);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_CLIP);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_CLEAR(hw_state->dirty, ANV_GFX_STATE_DS);
+
+ cmd_buffer_gfx_state_emission(cmd_buffer);
+
+ emit_wa_18020335297_dummy_draw(cmd_buffer);
+
+ /* Dirty all emitted WA state to make sure that current real
+ * state is restored.
+ */
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VFG);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
+ }
+ }
+
+ cmd_buffer_gfx_state_emission(cmd_buffer);
+}
+
+void
+genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
+{
+ if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
+ return;
+
+ if (cmd_buffer->state.pma_fix_enabled == enable)
+ return;
+
+ cmd_buffer->state.pma_fix_enabled = enable;
+
+ /* According to the Broadwell PIPE_CONTROL documentation, software should
+ * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
+ * prior to the LRI. If stencil buffer writes are enabled, then a Render
+ * Cache Flush is also necessary.
+ *
+ * The Skylake docs say to use a depth stall rather than a command
+ * streamer stall. However, the hardware seems to violently disagree.
+ * A full command streamer stall seems to be needed in both cases.
+ */
+ genx_batch_emit_pipe_control
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+ ANV_PIPE_CS_STALL_BIT |
+#if GFX_VER >= 12
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+#endif
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
+
+#if GFX_VER == 9
+ uint32_t cache_mode;
+ anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
+ .STCPMAOptimizationEnable = enable,
+ .STCPMAOptimizationEnableMask = true);
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(CACHE_MODE_0_num);
+ lri.DataDWord = cache_mode;
+ }
+
+#endif /* GFX_VER == 9 */
+
+ /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
+ * Flush bits is often necessary. We do it regardless because it's easier.
+ * The render cache flush is also necessary if stencil writes are enabled.
+ *
+ * Again, the Skylake docs give a different set of flushes but the BDW
+ * flushes seem to work just as well.
+ */
+ genx_batch_emit_pipe_control
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_DEPTH_STALL_BIT |
+ ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
+#if GFX_VER >= 12
+ ANV_PIPE_TILE_CACHE_FLUSH_BIT |
+#endif
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
+}
diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c
index 8f83212b2d7..60ca6f0a248 100644
--- a/src/intel/vulkan/genX_gpu_memcpy.c
+++ b/src/intel/vulkan/genX_gpu_memcpy.c
@@ -51,14 +51,86 @@ gcd_pow2_u64(uint64_t a, uint64_t b)
return 1 << MIN2(a_log2, b_log2);
}
-void
-genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
- struct anv_address dst, struct anv_address src,
- uint32_t size)
+static void
+emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device,
+ const struct intel_urb_config *urb_cfg_in,
+ struct intel_urb_config *urb_cfg_out,
+ const struct intel_l3_config *l3_config)
{
- if (size == 0)
- return;
+ anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.InstancingEnable = false;
+ vfi.VertexElementIndex = 0;
+ }
+ anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs);
+#if GFX_VER >= 11
+ anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
+#endif
+
+ /* Disable all shader stages */
+ anv_batch_emit(batch, GENX(3DSTATE_VS), vs);
+ anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
+ anv_batch_emit(batch, GENX(3DSTATE_TE), te);
+ anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
+ anv_batch_emit(batch, GENX(3DSTATE_GS), gs);
+ anv_batch_emit(batch, GENX(3DSTATE_PS), ps);
+
+#if GFX_VERx10 >= 125
+ /* Disable Mesh; we can't have it and streamout enabled at the same
+ * time.
+ */
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh);
+ anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task);
+ }
+#endif
+
+#if INTEL_WA_16013994831_GFX_VER
+ /* Wa_16013994831 - Disable preemption during streamout. */
+ if (intel_needs_workaround(device->info, 16013994831))
+ genX(batch_set_preemption)(batch, device->info, _3D, false);
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) {
+ sbe.VertexURBEntryReadOffset = 1;
+ sbe.NumberofSFOutputAttributes = 1;
+ sbe.VertexURBEntryReadLength = 1;
+ sbe.ForceVertexURBEntryReadLength = true;
+ sbe.ForceVertexURBEntryReadOffset = true;
+
+ for (unsigned i = 0; i < 32; i++)
+ sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+ }
+
+ /* Emit URB setup. We tell it that the VS is active because we want it to
+ * allocate space for the VS. Even though one isn't run, we need VUEs to
+ * store the data that VF is going to pass to SOL.
+ */
+ const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 };
+ memcpy(urb_cfg_out->size, &entry_size, sizeof(entry_size));
+
+ genX(emit_urb_setup)(device, batch, l3_config,
+ VK_SHADER_STAGE_VERTEX_BIT, urb_cfg_in, urb_cfg_out,
+ NULL);
+
+#if GFX_VER >= 12
+ /* Disable Primitive Replication. */
+ anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+#endif
+ anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+ topo.PrimitiveTopologyType = _3DPRIM_POINTLIST;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
+ vf.StatisticsEnable = false;
+ }
+}
+
+static void
+emit_so_memcpy(struct anv_batch *batch, struct anv_device *device,
+ struct anv_address dst, struct anv_address src,
+ uint32_t size)
+{
/* The maximum copy block size is 4 32-bit components at a time. */
assert(size % 4 == 0);
unsigned bs = gcd_pow2_u64(16, size);
@@ -72,38 +144,23 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
unreachable("Invalid size");
}
- if (!cmd_buffer->state.current_l3_config) {
- const struct intel_l3_config *cfg =
- intel_get_default_l3_config(&cmd_buffer->device->info);
- genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
- }
-
- genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 32, src, size);
- genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
-
- genX(flush_pipeline_select_3d)(cmd_buffer);
-
uint32_t *dw;
- dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(3DSTATE_VERTEX_BUFFERS));
- GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, dw + 1,
+ dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_VERTEX_BUFFERS));
+ GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1,
&(struct GENX(VERTEX_BUFFER_STATE)) {
.VertexBufferIndex = 32, /* Reserved for this */
.AddressModifyEnable = true,
.BufferStartingAddress = src,
.BufferPitch = bs,
- .MOCS = anv_mocs(cmd_buffer->device, src.bo, 0),
+ .MOCS = anv_mocs(device, src.bo, 0),
#if GFX_VER >= 12
.L3BypassDisable = true,
#endif
-#if (GFX_VER >= 8)
.BufferSize = size,
-#else
- .EndAddress = anv_address_add(src, size - 1),
-#endif
});
- dw = anv_batch_emitn(&cmd_buffer->batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS));
- GENX(VERTEX_ELEMENT_STATE_pack)(&cmd_buffer->batch, dw + 1,
+ dw = anv_batch_emitn(batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS));
+ GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw + 1,
&(struct GENX(VERTEX_ELEMENT_STATE)) {
.VertexBufferIndex = 32,
.Valid = true,
@@ -115,69 +172,29 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
.Component3Control = (bs >= 16) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
});
-#if GFX_VER >= 8
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
- vfi.InstancingEnable = false;
- vfi.VertexElementIndex = 0;
- }
-#endif
-#if GFX_VER >= 8
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_SGVS), sgvs);
-#endif
-
- /* Disable all shader stages */
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VS), vs);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HS), hs);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TE), te);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DS), DS);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_GS), gs);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PS), gs);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SBE), sbe) {
- sbe.VertexURBEntryReadOffset = 1;
- sbe.NumberofSFOutputAttributes = 1;
- sbe.VertexURBEntryReadLength = 1;
-#if GFX_VER >= 8
- sbe.ForceVertexURBEntryReadLength = true;
- sbe.ForceVertexURBEntryReadOffset = true;
-#endif
-
-#if GFX_VER >= 9
- for (unsigned i = 0; i < 32; i++)
- sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
-#endif
- }
-
- /* Emit URB setup. We tell it that the VS is active because we want it to
- * allocate space for the VS. Even though one isn't run, we need VUEs to
- * store the data that VF is going to pass to SOL.
+ /* Wa_16011411144:
+ *
+ * SW must insert a PIPE_CONTROL cmd before and after the
+ * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
+ * state is not combined with other state changes.
*/
- const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 };
-
- genX(emit_urb_setup)(cmd_buffer->device, &cmd_buffer->batch,
- cmd_buffer->state.current_l3_config,
- VK_SHADER_STAGE_VERTEX_BIT, entry_size, NULL);
+ if (intel_needs_workaround(device->info, 16011411144))
+ genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
+ anv_batch_emit(batch, GENX(3DSTATE_SO_BUFFER), sob) {
#if GFX_VER < 12
sob.SOBufferIndex = 0;
#else
sob._3DCommandOpcode = 0;
sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD;
#endif
- sob.MOCS = anv_mocs(cmd_buffer->device, dst.bo, 0),
+ sob.MOCS = anv_mocs(device, dst.bo, ISL_SURF_USAGE_STREAM_OUT_BIT),
sob.SurfaceBaseAddress = dst;
-#if GFX_VER >= 8
sob.SOBufferEnable = true;
sob.SurfaceSize = size / 4 - 1;
-#else
- sob.SurfacePitch = bs;
- sob.SurfaceEndAddress = anv_address_add(dst, size);
-#endif
-#if GFX_VER >= 8
/* As SOL writes out data, it updates the SO_WRITE_OFFSET registers with
* the end position of the stream. We need to reset this value to 0 at
* the beginning of the run or else SOL will start at the offset from
@@ -185,21 +202,16 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
*/
sob.StreamOffsetWriteEnable = true;
sob.StreamOffset = 0;
-#endif
}
-#if GFX_VER <= 7
- /* The hardware can do this for us on BDW+ (see above) */
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), load) {
- load.RegisterOffset = GENX(SO_WRITE_OFFSET0_num);
- load.DataDWord = 0;
- }
-#endif
+   /* Wa_16011411144: also emit a CS_STALL after the SO_BUFFER change. */
+ if (intel_needs_workaround(device->info, 16011411144))
+ genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);
- dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(3DSTATE_SO_DECL_LIST),
+ dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_SO_DECL_LIST),
.StreamtoBufferSelects0 = (1 << 0),
.NumEntries0 = 1);
- GENX(SO_DECL_ENTRY_pack)(&cmd_buffer->batch, dw + 3,
+ GENX(SO_DECL_ENTRY_pack)(batch, dw + 3,
&(struct GENX(SO_DECL_ENTRY)) {
.Stream0Decl = {
.OutputBufferSlot = 0,
@@ -208,36 +220,22 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
},
});
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_STREAMOUT), so) {
+#if GFX_VERx10 == 125
+ /* Wa_14015946265: Send PC with CS stall after SO_DECL. */
+ genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {
so.SOFunctionEnable = true;
so.RenderingDisable = true;
so.Stream0VertexReadOffset = 0;
so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64);
-#if GFX_VER >= 8
so.Buffer0SurfacePitch = bs;
-#else
- so.SOBufferEnable0 = true;
-#endif
- }
-
-#if GFX_VER >= 8
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
- topo.PrimitiveTopologyType = _3DPRIM_POINTLIST;
}
-#endif
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_STATISTICS), vf) {
- vf.StatisticsEnable = false;
- }
-
-#if GFX_VER >= 12
- /* Disable Primitive Replication. */
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
-#endif
- anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+ genX(emit_breakpoint)(batch, device, true);
+ anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
prim.VertexAccessType = SEQUENTIAL;
- prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
prim.VertexCountPerInstance = size / bs;
prim.StartVertexLocation = 0;
prim.InstanceCount = 1;
@@ -245,8 +243,147 @@ genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
prim.BaseVertexLocation = 0;
}
+ genX(batch_emit_post_3dprimitive_was)(batch,
+ device,
+ _3DPRIM_POINTLIST, size / bs);
+
+ genX(emit_breakpoint)(batch, device, false);
+}
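
To make the sizing logic above concrete, here is a small standalone sketch (not driver code, helper name hypothetical) of how the copy is expressed as a point-list draw: each vertex moves one block of bs bytes, where bs is the largest power of two dividing both 16 and size:

   #include <assert.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Largest power of two dividing both a and b (both must be non-zero). */
   static uint64_t
   gcd_pow2(uint64_t a, uint64_t b)
   {
      unsigned a_log2 = __builtin_ctzll(a);
      unsigned b_log2 = __builtin_ctzll(b);
      return 1ull << (a_log2 < b_log2 ? a_log2 : b_log2);
   }

   int
   main(void)
   {
      uint32_t size = 52;                /* bytes to copy, multiple of 4 */
      assert(size % 4 == 0);
      uint32_t bs = gcd_pow2(16, size);  /* copy block size: 4, 8 or 16 bytes */
      uint32_t vertex_count = size / bs; /* one point vertex per block */
      printf("bs=%u vertices=%u\n", bs, vertex_count); /* bs=4 vertices=13 */
      return 0;
   }
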
+
+void
+genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
+ struct anv_device *device,
+ struct anv_batch *batch)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->batch = batch;
+ state->device = device;
+
+ const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
+ genX(emit_l3_config)(batch, device, cfg);
+ genX(emit_pipeline_select)(batch, _3D, device);
+
+ struct intel_urb_config urb_cfg_in = { 0 };
+ struct intel_urb_config urb_cfg = { 0 };
+
+ emit_common_so_memcpy(batch, device, &urb_cfg_in, &urb_cfg, cfg);
+}
+
+void
+genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state)
+{
+ genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ NULL);
+}
+
+void
+genX(emit_so_memcpy_end)(struct anv_memcpy_state *state)
+{
+ if (intel_needs_workaround(state->device->info, 16013994831))
+ genX(batch_set_preemption)(state->batch, state->device->info, _3D, true);
+
+ anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end);
+
+ if ((state->batch->next - state->batch->start) & 4)
+ anv_batch_emit(state->batch, GENX(MI_NOOP), noop);
+}
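
A minimal sketch of the alignment check above, assuming every command is a multiple of 4 bytes: if the batch length is not QWORD (8-byte) aligned, a single 4-byte MI_NOOP pads it out. The helper name below is hypothetical:

   #include <stdint.h>
   #include <stdio.h>

   /* Number of 4-byte no-ops needed to pad a dword-aligned batch of
    * 'length_bytes' out to an 8-byte (QWORD) boundary.
    */
   static unsigned
   qword_pad_noops(uint32_t length_bytes)
   {
      return (length_bytes & 4) ? 1 : 0;
   }

   int
   main(void)
   {
      printf("len=20 -> %u noop(s)\n", qword_pad_noops(20)); /* 1 */
      printf("len=24 -> %u noop(s)\n", qword_pad_noops(24)); /* 0 */
      return 0;
   }
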
+
+void
+genX(emit_so_memcpy)(struct anv_memcpy_state *state,
+ struct anv_address dst, struct anv_address src,
+ uint32_t size)
+{
+ if (GFX_VER == 9 &&
+ anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound,
+ &state->vb_dirty,
+ src, size)) {
+ genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
+ NULL);
+ memset(&state->vb_dirty, 0, sizeof(state->vb_dirty));
+ }
+
+ emit_so_memcpy(state->batch, state->device, dst, src, size);
+}
+
+void
+genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address dst, struct anv_address src,
+ uint32_t size)
+{
+ if (size == 0)
+ return;
+
+ if (!cmd_buffer->state.current_l3_config) {
+ const struct intel_l3_config *cfg =
+ intel_get_default_l3_config(cmd_buffer->device->info);
+ genX(cmd_buffer_config_l3)(cmd_buffer, cfg);
+ }
+
+#if GFX_VER == 9
+ genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 32, src, size);
+#endif
+
+ /* Wa_14015814527 */
+ genX(apply_task_urb_workaround)(cmd_buffer);
+
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ struct intel_urb_config urb_cfg;
+
+ emit_common_so_memcpy(&cmd_buffer->batch, cmd_buffer->device,
+ &cmd_buffer->state.gfx.urb_cfg,
+ &urb_cfg,
+ cmd_buffer->state.current_l3_config);
+ emit_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, dst, src, size);
+
+#if GFX_VER == 9
genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL,
1ull << 32);
+#endif
+
+ /* Update urb config after memcpy. */
+ memcpy(&cmd_buffer->state.gfx.urb_cfg, &urb_cfg,
+ sizeof(struct intel_urb_config));
+
+ /* Flag all the instructions emitted by the memcpy. */
+ struct anv_gfx_dynamic_state *hw_state =
+ &cmd_buffer->state.gfx.dyn_state;
+
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+#if GFX_VER >= 11
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+#endif
+#if GFX_VER >= 12
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+#endif
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
+ if (cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
+ }
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE;
+ cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_PIPELINE |
+ ANV_CMD_DIRTY_INDEX_BUFFER);
}
diff --git a/src/intel/vulkan/genX_init_state.c b/src/intel/vulkan/genX_init_state.c
new file mode 100644
index 00000000000..e86a6e42232
--- /dev/null
+++ b/src/intel/vulkan/genX_init_state.c
@@ -0,0 +1,1446 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+#include "common/intel_aux_map.h"
+#include "common/intel_sample_positions.h"
+#include "common/intel_pixel_hash.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+
+#include "vk_standard_sample_locations.h"
+
+#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
+#include "grl/genX_grl.h"
+#endif
+
+#include "vk_util.h"
+#include "vk_format.h"
+
+static void
+genX(emit_slice_hashing_state)(struct anv_device *device,
+ struct anv_batch *batch)
+{
+#if GFX_VER == 11
+ /* Gfx11 hardware has two pixel pipes at most. */
+ for (unsigned i = 2; i < ARRAY_SIZE(device->info->ppipe_subslices); i++)
+ assert(device->info->ppipe_subslices[i] == 0);
+
+ if (device->info->ppipe_subslices[0] == device->info->ppipe_subslices[1])
+ return;
+
+ if (!device->slice_hash.alloc_size) {
+ unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
+ device->slice_hash =
+ anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
+
+ const bool flip = device->info->ppipe_subslices[0] <
+ device->info->ppipe_subslices[1];
+ struct GENX(SLICE_HASH_TABLE) table;
+ intel_compute_pixel_hash_table_3way(16, 16, 3, 3, flip, table.Entry[0]);
+
+ GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
+ ptr.SliceHashStatePointerValid = true;
+ ptr.SliceHashTableStatePointer = device->slice_hash.offset;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
+ mode.SliceHashingTableEnable = true;
+ }
+#elif GFX_VERx10 == 120
+ /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
+ * present with n active dual subslices.
+ */
+ unsigned ppipes_of[3] = {};
+
+ for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
+ for (unsigned p = 0; p < 3; p++)
+ ppipes_of[n] += (device->info->ppipe_subslices[p] == n);
+ }
+
+ /* Gfx12 has three pixel pipes. */
+ for (unsigned p = 3; p < ARRAY_SIZE(device->info->ppipe_subslices); p++)
+ assert(device->info->ppipe_subslices[p] == 0);
+
+ if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
+ /* All three pixel pipes have the maximum number of active dual
+ * subslices, or there is only one active pixel pipe: Nothing to do.
+ */
+ return;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
+ p.SliceHashControl[0] = TABLE_0;
+
+ if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
+ else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
+
+ if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
+ else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
+ else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
+ intel_compute_pixel_hash_table_3way(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
+ else
+ unreachable("Illegal fusing.");
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
+ p.SubsliceHashingTableEnable = true;
+ p.SubsliceHashingTableEnableMask = true;
+ }
+#elif GFX_VERx10 == 125
+ /* Calculate the set of present pixel pipes, and another set of
+    * present pixel pipes with 2 dual subslices enabled; the latter
+    * will appear in the hashing table with twice the frequency of
+ * pixel pipes with a single dual subslice present.
+ */
+ uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
+ for (unsigned p = 0; p < ARRAY_SIZE(device->info->ppipe_subslices); p++) {
+ if (device->info->ppipe_subslices[p] > 0)
+ ppipe_mask1 |= (1u << p);
+ if (device->info->ppipe_subslices[p] > 1)
+ ppipe_mask2 |= (1u << p);
+ }
+ assert(ppipe_mask1);
+
+ if (!device->slice_hash.alloc_size) {
+ unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
+ device->slice_hash =
+ anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
+
+ struct GENX(SLICE_HASH_TABLE) table;
+
+      /* Note that the hardware expects an array with 7 tables, each table
+       * intended to specify the pixel pipe hashing behavior for every
+       * possible slice count between 2 and 8. However, that doesn't
+       * actually work, among other reasons due to hardware bugs that will
+       * cause the GPU to erroneously access the table at the wrong index
+       * in some cases, so in practice all 7 tables need to be initialized
+       * to the same value.
+ */
+ for (unsigned i = 0; i < 7; i++)
+ intel_compute_pixel_hash_table_nway(16, 16, ppipe_mask1, ppipe_mask2,
+ table.Entry[i][0]);
+
+ GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
+ ptr.SliceHashStatePointerValid = true;
+ ptr.SliceHashTableStatePointer = device->slice_hash.offset;
+ }
+
+   /* TODO: Figure out FCV support for other platforms.
+    * Testing indicates that FCV is broken on gfx125.
+    * Let's disable FCV for now until we figure out what's wrong.
+ *
+ * Alternatively, it can be toggled off via drirc option 'anv_disable_fcv'.
+ *
+ * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9987
+ * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10318
+ * Ref: https://gitlab.freedesktop.org/mesa/mesa/-/issues/10795
+ * Ref: Internal issue 1480 about Unreal Engine 5.1
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
+ mode.SliceHashingTableEnable = true;
+ mode.SliceHashingTableEnableMask = true;
+ mode.CrossSliceHashingMode = (util_bitcount(ppipe_mask1) > 1 ?
+ hashing32x32 : NormalMode);
+ mode.CrossSliceHashingModeMask = -1;
+ mode.FastClearOptimizationEnable = !device->physical->disable_fcv;
+ mode.FastClearOptimizationEnableMask = !device->physical->disable_fcv;
+ }
+#endif
+}
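
As a standalone illustration of the Gfx12.5 mask computation above (the subslice counts below are made up): pipes with at least one dual subslice set a bit in the first mask, and pipes with two or more also set a bit in the second, which makes them appear twice as often in the hashing table:

   #include <stdint.h>
   #include <stdio.h>

   int
   main(void)
   {
      /* Hypothetical per-pixel-pipe dual subslice counts. */
      const unsigned ppipe_subslices[] = { 2, 1, 0, 2 };
      const unsigned n = sizeof(ppipe_subslices) / sizeof(ppipe_subslices[0]);

      uint32_t ppipe_mask1 = 0, ppipe_mask2 = 0;
      for (unsigned p = 0; p < n; p++) {
         if (ppipe_subslices[p] > 0)
            ppipe_mask1 |= 1u << p;   /* pipe is present */
         if (ppipe_subslices[p] > 1)
            ppipe_mask2 |= 1u << p;   /* pipe has 2 dual subslices */
      }

      printf("mask1=0x%x mask2=0x%x\n", ppipe_mask1, ppipe_mask2); /* 0xb 0x9 */
      return 0;
   }
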
+
+static void
+init_common_queue_state(struct anv_queue *queue, struct anv_batch *batch)
+{
+ UNUSED struct anv_device *device = queue->device;
+
+#if GFX_VER >= 11
+ /* Starting with GFX version 11, SLM is no longer part of the L3$ config
+ * so it never changes throughout the lifetime of the VkDevice.
+ */
+ const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
+ genX(emit_l3_config)(batch, device, cfg);
+ device->l3_config = cfg;
+#endif
+
+#if GFX_VERx10 == 125
+ /* Even though L3 partial write merging is supposed to be enabled
+ * by default on Gfx12.5 according to the hardware spec, i915
+ * appears to accidentally clear the enables during context
+ * initialization, so make sure to enable them here since partial
+ * write merging has a large impact on rendering performance.
+ */
+ anv_batch_write_reg(batch, GENX(L3SQCREG5), reg) {
+ reg.L3CachePartialWriteMergeTimerInitialValue = 0x7f;
+ reg.CompressiblePartialWriteMergeEnable = true;
+ reg.CoherentPartialWriteMergeEnable = true;
+ reg.CrossTilePartialWriteMergeEnable = true;
+ }
+#endif
+
+ /* Emit STATE_BASE_ADDRESS on Gfx12+ because we set a default CPS_STATE and
+ * those are relative to STATE_BASE_ADDRESS::DynamicStateBaseAddress.
+ */
+#if GFX_VER >= 12
+
+#if GFX_VERx10 >= 125
+ /* Wa_14016407139:
+ *
+ * "On Surface state base address modification, for 3D workloads, SW must
+ * always program PIPE_CONTROL either with CS Stall or PS sync stall. In
+ * both the cases set Render Target Cache Flush Enable".
+ */
+ genx_batch_emit_pipe_control(batch, device->info,
+ 0,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT);
+#endif
+
+ /* GEN:BUG:1607854226:
+ *
+ * Non-pipelined state has issues with not applying in MEDIA/GPGPU mode.
+ * Fortunately, we always start the context off in 3D mode.
+ */
+ uint32_t mocs = device->isl_dev.mocs.internal;
+ anv_batch_emit(batch, GENX(STATE_BASE_ADDRESS), sba) {
+ sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
+ sba.GeneralStateBufferSize = 0xfffff;
+ sba.GeneralStateMOCS = mocs;
+ sba.GeneralStateBaseAddressModifyEnable = true;
+ sba.GeneralStateBufferSizeModifyEnable = true;
+
+ sba.StatelessDataPortAccessMOCS = mocs;
+
+ sba.SurfaceStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.internal_surface_state_pool.addr,
+ };
+ sba.SurfaceStateMOCS = mocs;
+ sba.SurfaceStateBaseAddressModifyEnable = true;
+
+ sba.DynamicStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.dynamic_state_pool.addr,
+ };
+ sba.DynamicStateBufferSize = (device->physical->va.dynamic_state_pool.size +
+ device->physical->va.sampler_state_pool.size) / 4096;
+ sba.DynamicStateMOCS = mocs;
+ sba.DynamicStateBaseAddressModifyEnable = true;
+ sba.DynamicStateBufferSizeModifyEnable = true;
+
+ sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
+ sba.IndirectObjectBufferSize = 0xfffff;
+ sba.IndirectObjectMOCS = mocs;
+ sba.IndirectObjectBaseAddressModifyEnable = true;
+ sba.IndirectObjectBufferSizeModifyEnable = true;
+
+ sba.InstructionBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.instruction_state_pool.addr,
+ };
+ sba.InstructionBufferSize = device->physical->va.instruction_state_pool.size / 4096;
+ sba.InstructionMOCS = mocs;
+ sba.InstructionBaseAddressModifyEnable = true;
+ sba.InstructionBuffersizeModifyEnable = true;
+
+#if GFX_VER >= 11
+ sba.BindlessSamplerStateBaseAddress = ANV_NULL_ADDRESS;
+ sba.BindlessSamplerStateBufferSize = 0;
+ sba.BindlessSamplerStateMOCS = mocs;
+ sba.BindlessSamplerStateBaseAddressModifyEnable = true;
+#endif
+
+ if (device->physical->indirect_descriptors) {
+ sba.BindlessSurfaceStateBaseAddress =
+ (struct anv_address) { .offset =
+ device->physical->va.bindless_surface_state_pool.addr,
+ };
+ sba.BindlessSurfaceStateSize =
+ anv_physical_device_bindless_heap_size(device->physical, false) /
+ ANV_SURFACE_STATE_SIZE - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+ } else {
+ /* Bindless Surface State & Bindless Sampler State are aligned to the
+ * same heap
+ */
+ sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
+ .offset = device->physical->va.internal_surface_state_pool.addr,
+ };
+ sba.BindlessSurfaceStateSize =
+ (device->physical->va.internal_surface_state_pool.size +
+ device->physical->va.bindless_surface_state_pool.size) - 1;
+ sba.BindlessSurfaceStateMOCS = mocs;
+ sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
+ }
+
+#if GFX_VERx10 >= 125
+ sba.L1CacheControl = L1CC_WB;
+#endif
+ }
+#endif
+
+#if GFX_VERx10 >= 125
+ if (ANV_SUPPORT_RT && device->info->has_ray_tracing) {
+ anv_batch_emit(batch, GENX(3DSTATE_BTD), btd) {
+ /* TODO: This is the timeout after which the bucketed thread
+ * dispatcher will kick off a wave of threads. We go with the
+ * lowest value for now. It could be tweaked on a per
+ * application basis (drirc).
+ */
+ btd.DispatchTimeoutCounter = _64clocks;
+ /* BSpec 43851: "This field must be programmed to 6h i.e. memory
+ * backed buffer must be 128KB."
+ */
+ btd.PerDSSMemoryBackedBufferSize = 6;
+ btd.MemoryBackedBufferBasePointer = (struct anv_address) {
+ /* This batch doesn't have a reloc list so we can't use the BO
+ * here. We just use the address directly.
+ */
+ .offset = device->btd_fifo_bo->offset,
+ };
+ }
+ }
+#endif
+}
+
+static VkResult
+init_render_queue_state(struct anv_queue *queue, bool is_companion_rcs_batch)
+{
+ struct anv_device *device = queue->device;
+ UNUSED const struct intel_device_info *devinfo = queue->device->info;
+ uint32_t cmds[256];
+ struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *) cmds + sizeof(cmds),
+ };
+
+ struct GENX(VERTEX_ELEMENT_STATE) empty_ve = {
+ .Valid = true,
+ .Component0Control = VFCOMP_STORE_0,
+ .Component1Control = VFCOMP_STORE_0,
+ .Component2Control = VFCOMP_STORE_0,
+ .Component3Control = VFCOMP_STORE_0,
+ };
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL, device->empty_vs_input, &empty_ve);
+
+ genX(emit_pipeline_select)(&batch, _3D, device);
+
+#if GFX_VER == 9
+ anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
+ cm1.FloatBlendOptimizationEnable = true;
+ cm1.FloatBlendOptimizationEnableMask = true;
+ cm1.MSCRAWHazardAvoidanceBit = true;
+ cm1.MSCRAWHazardAvoidanceBitMask = true;
+ cm1.PartialResolveDisableInVC = true;
+ cm1.PartialResolveDisableInVCMask = true;
+ }
+#endif
+
+ anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
+
+ anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+ rect.ClippedDrawingRectangleYMin = 0;
+ rect.ClippedDrawingRectangleXMin = 0;
+ rect.ClippedDrawingRectangleYMax = UINT16_MAX;
+ rect.ClippedDrawingRectangleXMax = UINT16_MAX;
+ rect.DrawingRectangleOriginY = 0;
+ rect.DrawingRectangleOriginX = 0;
+ }
+
+ anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
+
+ /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
+ *
+ * "3DSTATE_RASTER if used must be programmed prior to using this
+ * packet."
+ *
+ * Emit this before 3DSTATE_WM_HZ_OP below.
+ */
+ anv_batch_emit(&batch, GENX(3DSTATE_RASTER), rast) {
+ rast.APIMode = DX101;
+ }
+
+ /* SKL PRMs, Volume 2a: Command Reference: Instructions: 3DSTATE_WM_HZ_OP:
+ *
+ * "3DSTATE_MULTISAMPLE packet must be used prior to this packet to
+ * change the Number of Multisamples. This packet must not be used to
+ * change Number of Multisamples in a rendering sequence."
+ *
+ * Emit this before 3DSTATE_WM_HZ_OP below.
+ */
+ anv_batch_emit(&batch, GENX(3DSTATE_MULTISAMPLE), ms);
+
+ /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
+ * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
+ * Clear." It mentions that the packet overrides GPU state for the clear
+ * operation and needs to be reset to 0s to clear the overrides. Depending
+ * on the kernel, we may not get a context with the state for this packet
+ * zeroed. Do it ourselves just in case. We've observed this to prevent a
+ * number of GPU hangs on ICL.
+ */
+ anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp);
+
+ genX(emit_sample_pattern)(&batch, NULL);
+
+#if GFX_VER == 11
+   /* Bit 5 "Headerless Message for Pre-emptable Contexts" in the
+    * SAMPLER_MODE register defaults to 0, which means headerless sampler
+    * messages are not allowed for pre-emptable contexts. Set bit 5 to 1
+    * to allow them.
+ */
+ anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) {
+ sm.HeaderlessMessageforPreemptableContexts = true;
+ sm.HeaderlessMessageforPreemptableContextsMask = true;
+ }
+
+ /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
+ * HALF_SLICE_CHICKEN7 register.
+ */
+ anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
+ hsc7.EnabledTexelOffsetPrecisionFix = true;
+ hsc7.EnabledTexelOffsetPrecisionFixMask = true;
+ }
+
+ anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) {
+ tcc.L3DataPartialWriteMergingEnable = true;
+ tcc.ColorZPartialWriteMergingEnable = true;
+ tcc.URBPartialWriteMergingEnable = true;
+ tcc.TCDisable = true;
+ }
+#endif
+ genX(emit_slice_hashing_state)(device, &batch);
+
+#if GFX_VER >= 11
+   /* The hardware specification recommends disabling repacking for
+    * compatibility with the decompression mechanism in the display
+    * controller.
+ */
+ if (device->info->disable_ccs_repack) {
+ anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) {
+ cm0.DisableRepackingforCompression = true;
+ cm0.DisableRepackingforCompressionMask = true;
+ }
+ }
+
+   /* An unknown issue is causing VS push constants to become
+    * corrupted during object-level preemption. For now, restrict
+    * preemption to the command buffer level to avoid rendering
+ * corruption.
+ */
+ anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) {
+ cc1.ReplayMode = MidcmdbufferPreemption;
+ cc1.ReplayModeMask = true;
+
+#if GFX_VERx10 == 120
+ cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommand = true;
+ cc1.DisablePreemptionandHighPriorityPausingdueto3DPRIMITIVECommandMask = true;
+#endif
+ }
+
+#if INTEL_NEEDS_WA_1806527549
+ /* Wa_1806527549 says to disable the following HiZ optimization when the
+ * depth buffer is D16_UNORM. We've found the WA to help with more depth
+ * buffer configurations however, so we always disable it just to be safe.
+ */
+ anv_batch_write_reg(&batch, GENX(HIZ_CHICKEN), reg) {
+ reg.HZDepthTestLEGEOptimizationDisable = true;
+ reg.HZDepthTestLEGEOptimizationDisableMask = true;
+ }
+#endif
+
+#if GFX_VER == 12
+ anv_batch_write_reg(&batch, GENX(FF_MODE2), reg) {
+ /* On Alchemist, the FF_MODE2 docs for the GS timer say:
+ *
+ * "The timer value must be set to 224."
+ *
+ * and Wa_16011163337 indicates this is the case for all Gfx12 parts,
+ * and that this is necessary to avoid hanging the HS/DS units. It
+ * also clarifies that 224 is literally 0xE0 in the bits, not 7*32=224.
+ *
+ * The HS timer docs also have the same quote for Alchemist. I am
+ * unaware of a reason it needs to be set to 224 on Tigerlake, but
+ * we do so for consistency if nothing else.
+ *
+ * For the TDS timer value, the docs say:
+ *
+ * "For best performance, a value of 4 should be programmed."
+ *
+ * i915 also sets it this way on Tigerlake due to workarounds.
+ *
+ * The default VS timer appears to be 0, so we leave it at that.
+ */
+ reg.GSTimerValue = 224;
+ reg.HSTimerValue = 224;
+ reg.TDSTimerValue = 4;
+ reg.VSTimerValue = 0;
+ }
+#endif
+
+#if INTEL_NEEDS_WA_1508744258
+ /* Disable RHWO by setting 0x7010[14] by default except during resolve
+ * pass.
+ *
+ * We implement global disabling of the optimization here and we toggle it
+ * in anv_image_ccs_op().
+ */
+ anv_batch_write_reg(&batch, GENX(COMMON_SLICE_CHICKEN1), c1) {
+ c1.RCCRHWOOptimizationDisable = true;
+ c1.RCCRHWOOptimizationDisableMask = true;
+ }
+#endif
+
+#if GFX_VERx10 < 125
+#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
+#else
+#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
+#endif
+
+ /* Enable the new line drawing algorithm that produces higher quality
+ * lines.
+ */
+ anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) {
+ c3.AALineQualityFix = true;
+ c3.AALineQualityFixMask = true;
+ }
+#endif
+
+#if GFX_VER == 12
+ if (device->info->has_aux_map) {
+ uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
+ assert(aux_base_addr % (32 * 1024) == 0);
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
+ lri.DataDWord = aux_base_addr & 0xffffffff;
+ }
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
+ lri.DataDWord = aux_base_addr >> 32;
+ }
+ }
+#endif
+
+#if GFX_VERx10 == 125
+ anv_batch_write_reg(&batch, GENX(CHICKEN_RASTER_2), reg) {
+ reg.TBIMRBatchSizeOverride = true;
+ reg.TBIMROpenBatchEnable = true;
+ reg.TBIMRFastClip = true;
+ reg.TBIMRBatchSizeOverrideMask = true;
+ reg.TBIMROpenBatchEnableMask = true;
+ reg.TBIMRFastClipMask = true;
+ }
+#endif
+
+ /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
+ * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
+ *
+ * This is only safe on kernels with context isolation support.
+ */
+ assert(device->physical->info.has_context_isolation);
+ anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) {
+ csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
+ csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
+ }
+
+ init_common_queue_state(queue, &batch);
+
+ /* Because 3DSTATE_CPS::CoarsePixelShadingStateArrayPointer is relative to
+ * the dynamic state base address we need to emit this instruction after
+ * STATE_BASE_ADDRESS in init_common_queue_state().
+ */
+#if GFX_VER == 11
+ anv_batch_emit(&batch, GENX(3DSTATE_CPS), cps);
+#elif GFX_VER >= 12
+ anv_batch_emit(&batch, GENX(3DSTATE_CPS_POINTERS), cps) {
+ assert(device->cps_states.alloc_size != 0);
+ /* Offset 0 is the disabled state */
+ cps.CoarsePixelShadingStateArrayPointer =
+ device->cps_states.offset;
+ }
+#endif
+
+#if GFX_VERx10 >= 125
+ anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), cm) {
+ cm.Mask1 = 0xffff;
+ }
+ anv_batch_emit(&batch, GENX(3DSTATE_MESH_CONTROL), zero);
+ anv_batch_emit(&batch, GENX(3DSTATE_TASK_CONTROL), zero);
+
+   /* We are no longer required to explicitly flush or invalidate caches
+    * since PIPELINE_SELECT is deprecated on Xe2+.
+ */
+#if GFX_VER < 20
+ genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
+ ANV_NULL_ADDRESS,
+ 0,
+ ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
+#endif
+
+ genX(emit_pipeline_select)(&batch, GPGPU, device);
+ anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
+ cfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * devinfo->subslice_total;
+ }
+
+   /* We are no longer required to explicitly flush or invalidate caches
+    * since PIPELINE_SELECT is deprecated on Xe2+.
+ */
+#if GFX_VER < 20
+ genx_batch_emit_pipe_control_write(&batch, device->info, _3D, NoWrite,
+ ANV_NULL_ADDRESS,
+ 0,
+ ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS);
+#endif
+
+ genX(emit_pipeline_select)(&batch, _3D, device);
+#endif
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+
+ assert(batch.next <= batch.end);
+
+ if (!device->trtt.queue)
+ device->trtt.queue = queue;
+
+ return anv_queue_submit_simple_batch(queue, &batch, is_companion_rcs_batch);
+}
+
+static VkResult
+init_compute_queue_state(struct anv_queue *queue)
+{
+ UNUSED const struct intel_device_info *devinfo = queue->device->info;
+ uint32_t cmds[64];
+ struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *) cmds + sizeof(cmds),
+ };
+
+ genX(emit_pipeline_select)(&batch, GPGPU, queue->device);
+
+#if GFX_VER == 12
+ if (queue->device->info->has_aux_map) {
+ uint64_t aux_base_addr =
+ intel_aux_map_get_base(queue->device->aux_map_ctx);
+ assert(aux_base_addr % (32 * 1024) == 0);
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num);
+ lri.DataDWord = aux_base_addr & 0xffffffff;
+ }
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = GENX(COMPCS0_AUX_TABLE_BASE_ADDR_num) + 4;
+ lri.DataDWord = aux_base_addr >> 32;
+ }
+ }
+#else
+ assert(!queue->device->info->has_aux_map);
+#endif
+
+ /* Wa_14015782607 - Issue pipe control with HDC_flush and
+ * untyped cache flush set to 1 when CCS has NP state update with
+ * STATE_COMPUTE_MODE.
+ */
+ if (intel_needs_workaround(devinfo, 14015782607) &&
+ queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ genx_batch_emit_pipe_control(&batch, devinfo, GPGPU,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
+ }
+
+#if GFX_VERx10 >= 125
+ /* Wa_14014427904/22013045878 - We need additional invalidate/flush when
+ * emitting NP state commands with ATS-M in compute mode.
+ */
+ if (intel_device_info_is_atsm(devinfo) &&
+ queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ genx_batch_emit_pipe_control
+ (&batch, devinfo, GPGPU,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
+ ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
+ }
+
+ anv_batch_emit(&batch, GENX(STATE_COMPUTE_MODE), cm) {
+ cm.PixelAsyncComputeThreadLimit = 4;
+ cm.PixelAsyncComputeThreadLimitMask = 0x7;
+ }
+#endif
+
+ init_common_queue_state(queue, &batch);
+
+#if GFX_VERx10 >= 125
+ anv_batch_emit(&batch, GENX(CFE_STATE), cfe) {
+ cfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * devinfo->subslice_total;
+ }
+#endif
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+
+ assert(batch.next <= batch.end);
+
+ return anv_queue_submit_simple_batch(queue, &batch,
+ false /* is_companion_rcs_batch */);
+}
+
+static VkResult
+init_copy_video_queue_state(struct anv_queue *queue)
+{
+#if GFX_VER >= 12
+ UNUSED const struct intel_device_info *devinfo = queue->device->info;
+ uint32_t cmds[64];
+ UNUSED struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *) cmds + sizeof(cmds),
+ };
+
+ if (queue->device->info->has_aux_map) {
+ uint64_t reg = GENX(VD0_AUX_TABLE_BASE_ADDR_num);
+
+ if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY) {
+#if GFX_VERx10 >= 125
+ reg = GENX(BCS_AUX_TABLE_BASE_ADDR_num);
+#endif
+ }
+
+ uint64_t aux_base_addr =
+ intel_aux_map_get_base(queue->device->aux_map_ctx);
+ assert(aux_base_addr % (32 * 1024) == 0);
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = reg;
+ lri.DataDWord = aux_base_addr & 0xffffffff;
+ }
+ anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+ lri.RegisterOffset = reg + 4;
+ lri.DataDWord = aux_base_addr >> 32;
+ }
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+ assert(batch.next <= batch.end);
+
+ return anv_queue_submit_simple_batch(queue, &batch,
+ false /* is_companion_rcs_batch */);
+ }
+#else
+ assert(!queue->device->info->has_aux_map);
+#endif
+
+ return VK_SUCCESS;
+}
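
For illustration, here is a minimal sketch of how a 64-bit aux-table base address is split into the two 32-bit LRI payloads used above; the address value is hypothetical, and the 32KB alignment assertion mirrors the one in the driver:

   #include <assert.h>
   #include <inttypes.h>
   #include <stdint.h>
   #include <stdio.h>

   int
   main(void)
   {
      uint64_t aux_base_addr = 0x0000000180008000ull; /* hypothetical, 32KB aligned */
      assert(aux_base_addr % (32 * 1024) == 0);

      uint32_t lo = aux_base_addr & 0xffffffff; /* written to the base register     */
      uint32_t hi = aux_base_addr >> 32;        /* written to the base register + 4 */

      printf("lo=0x%08" PRIx32 " hi=0x%08" PRIx32 "\n", lo, hi);
      return 0;
   }
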
+
+void
+genX(init_physical_device_state)(ASSERTED struct anv_physical_device *pdevice)
+{
+ assert(pdevice->info.verx10 == GFX_VERx10);
+#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
+ genX(grl_load_rt_uuid)(pdevice->rt_uuid);
+ pdevice->max_grl_scratch_size = genX(grl_max_scratch_size)();
+#endif
+
+ pdevice->cmd_emit_timestamp = genX(cmd_emit_timestamp);
+
+ pdevice->gpgpu_pipeline_value = GPGPU;
+}
+
+VkResult
+genX(init_device_state)(struct anv_device *device)
+{
+ VkResult res;
+
+ device->slice_hash = (struct anv_state) { 0 };
+ for (uint32_t i = 0; i < device->queue_count; i++) {
+ struct anv_queue *queue = &device->queues[i];
+ switch (queue->family->engine_class) {
+ case INTEL_ENGINE_CLASS_RENDER:
+ res = init_render_queue_state(queue, false /* is_companion_rcs_batch */);
+ break;
+ case INTEL_ENGINE_CLASS_COMPUTE: {
+ res = init_compute_queue_state(queue);
+ if (res != VK_SUCCESS)
+ return res;
+
+ /**
+       * Execute the RCS init batch by default on the companion RCS command
+       * buffer in order to support MSAA copy/clear operations on the
+       * compute queue.
+ */
+ res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
+ break;
+ }
+ case INTEL_ENGINE_CLASS_VIDEO:
+ res = init_copy_video_queue_state(queue);
+ break;
+ case INTEL_ENGINE_CLASS_COPY:
+ res = init_copy_video_queue_state(queue);
+ if (res != VK_SUCCESS)
+ return res;
+
+ /**
+       * Execute the RCS init batch by default on the companion RCS command
+       * buffer in order to support MSAA copy/clear operations on the copy
+       * queue.
+ */
+ res = init_render_queue_state(queue, true /* is_companion_rcs_batch */);
+ break;
+ default:
+ res = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+ break;
+ }
+ if (res != VK_SUCCESS)
+ return res;
+ }
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer &&
+ device->slice_hash.alloc_size) {
+ device->slice_hash_db =
+ anv_state_pool_alloc(&device->dynamic_state_db_pool,
+ device->slice_hash.alloc_size, 64);
+
+ memcpy(device->slice_hash_db.map,
+ device->slice_hash.map,
+ device->slice_hash.alloc_size);
+ }
+
+ return res;
+}
+
+#if GFX_VERx10 >= 125
+#define maybe_for_each_shading_rate_op(name) \
+ for (VkFragmentShadingRateCombinerOpKHR name = VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR; \
+ name <= VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR; \
+ name++)
+#elif GFX_VER >= 12
+#define maybe_for_each_shading_rate_op(name)
+#endif
+
+/* Rather than reemitting the CPS_STATE structure every time it changes, and
+ * for as many viewports as needed, we can just prepare all possible cases up
+ * front and pick the right offset into the prepacked states when needed.
+ */
+void
+genX(init_cps_device_state)(struct anv_device *device)
+{
+#if GFX_VER >= 12
+ void *cps_state_ptr = device->cps_states.map;
+
+ /* Disabled CPS mode */
+ for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
+ /* ICL PRMs, Volume 2d: Command Reference: Structures: 3DSTATE_CPS_BODY:
+ *
+ * "It is an INVALID configuration to set the CPS mode other than
+ * CPS_MODE_NONE and request per-sample dispatch in 3DSTATE_PS_EXTRA.
+ * Such configuration should be disallowed at the API level, and
+ * rendering results are undefined."
+ *
+       * Since we select this state when per-coarse-pixel dispatch is disabled, and that
+ * includes when per-sample dispatch is enabled, we need to ensure this
+ * is set to NONE.
+ */
+ struct GENX(CPS_STATE) cps_state = {
+ .CoarsePixelShadingMode = CPS_MODE_NONE,
+ };
+
+ GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
+ cps_state_ptr += GENX(CPS_STATE_length) * 4;
+ }
+
+ maybe_for_each_shading_rate_op(op0) {
+ maybe_for_each_shading_rate_op(op1) {
+ for (uint32_t x = 1; x <= 4; x *= 2) {
+ for (uint32_t y = 1; y <= 4; y *= 2) {
+ struct GENX(CPS_STATE) cps_state = {
+ .CoarsePixelShadingMode = CPS_MODE_CONSTANT,
+ .MinCPSizeX = x,
+ .MinCPSizeY = y,
+ };
+
+#if GFX_VERx10 >= 125
+ static const uint32_t combiner_ops[] = {
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR] = PASSTHROUGH,
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR] = OVERRIDE,
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR] = HIGH_QUALITY,
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR] = LOW_QUALITY,
+ [VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MUL_KHR] = RELATIVE,
+ };
+
+ cps_state.Combiner0OpcodeforCPsize = combiner_ops[op0];
+ cps_state.Combiner1OpcodeforCPsize = combiner_ops[op1];
+#endif /* GFX_VERx10 >= 125 */
+
+ for (uint32_t __v = 0; __v < MAX_VIEWPORTS; __v++) {
+ GENX(CPS_STATE_pack)(NULL, cps_state_ptr, &cps_state);
+ cps_state_ptr += GENX(CPS_STATE_length) * 4;
+ }
+ }
+ }
+ }
+ }
+#endif /* GFX_VER >= 12 */
+}
+
+void
+genX(emit_l3_config)(struct anv_batch *batch,
+ const struct anv_device *device,
+ const struct intel_l3_config *cfg)
+{
+#if GFX_VER < 20
+ UNUSED const struct intel_device_info *devinfo = device->info;
+
+#if GFX_VER >= 12
+#define L3_ALLOCATION_REG GENX(L3ALLOC)
+#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
+#else
+#define L3_ALLOCATION_REG GENX(L3CNTLREG)
+#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
+#endif
+
+ anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
+ if (cfg == NULL || (GFX_VER >= 12 && cfg->n[INTEL_L3P_ALL] > 126)) {
+ assert(!cfg || !(cfg->n[INTEL_L3P_SLM] || cfg->n[INTEL_L3P_URB] ||
+ cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_RO] ||
+ cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_C] ||
+ cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_TC]));
+#if GFX_VER >= 12
+ l3cr.L3FullWayAllocationEnable = true;
+#else
+ unreachable("Invalid L3$ config");
+#endif
+ } else {
+#if GFX_VER < 11
+ l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
+#endif
+#if INTEL_NEEDS_WA_1406697149
+ /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
+ * set in L3CNTLREG register. The default setting of the bit is not
+ * the desirable behavior.
+ */
+ l3cr.ErrorDetectionBehaviorControl = true;
+ l3cr.UseFullWays = true;
+#endif /* INTEL_NEEDS_WA_1406697149 */
+ assert(cfg->n[INTEL_L3P_IS] == 0);
+ assert(cfg->n[INTEL_L3P_C] == 0);
+ assert(cfg->n[INTEL_L3P_T] == 0);
+ l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
+ l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
+ l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
+ l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
+ }
+ }
+#endif /* GFX_VER < 20 */
+}
+
+void
+genX(emit_sample_pattern)(struct anv_batch *batch,
+ const struct vk_sample_locations_state *sl)
+{
+ assert(sl == NULL || sl->grid_size.width == 1);
+ assert(sl == NULL || sl->grid_size.height == 1);
+
+ /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
+ * VkPhysicalDeviceFeatures::standardSampleLocations.
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
+ /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
+ *
+ * "When programming the sample offsets (for NUMSAMPLES_4 or _8
+ * and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
+ * (or 7 for 8X, or 15 for 16X) must have monotonically increasing
+ * distance from the pixel center. This is required to get the
+ * correct centroid computation in the device."
+ *
+       * However, the Vulkan spec seems to require that the samples occur
+ * in the order provided through the API. The standard sample patterns
+ * have the above property that they have monotonically increasing
+ * distances from the center but client-provided ones do not. As long as
+ * this only affects centroid calculations as the docs say, we should be
+ * ok because OpenGL and Vulkan only require that the centroid be some
+ * lit sample and that it's the same for all samples in a pixel; they
+ * have no requirement that it be the one closest to center.
+ */
+ for (uint32_t i = 1; i <= 16; i *= 2) {
+ switch (i) {
+ case VK_SAMPLE_COUNT_1_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_1X(sp._1xSample);
+ }
+ break;
+ case VK_SAMPLE_COUNT_2_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_2X(sp._2xSample);
+ }
+ break;
+ case VK_SAMPLE_COUNT_4_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_4X(sp._4xSample);
+ }
+ break;
+ case VK_SAMPLE_COUNT_8_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_8X(sp._8xSample);
+ }
+ break;
+ case VK_SAMPLE_COUNT_16_BIT:
+ if (sl && sl->per_pixel == i) {
+ INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, sl->locations);
+ } else {
+ INTEL_SAMPLE_POS_16X(sp._16xSample);
+ }
+ break;
+ default:
+ unreachable("Invalid sample count");
+ }
+ }
+ }
+}
+
+static uint32_t
+vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
+{
+ switch (filter) {
+ default:
+ unreachable("Invalid filter");
+ case VK_FILTER_NEAREST:
+ return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST;
+ case VK_FILTER_LINEAR:
+ return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
+ }
+}
+
+static uint32_t
+vk_to_intel_max_anisotropy(float ratio)
+{
+ return (CLAMP(ratio, 2, 16) - 2) / 2;
+}
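
As a quick worked example of the mapping above: the hardware field encodes the maximum anisotropy ratio in steps of 2 starting at 2:1, so API values are clamped to [2, 16] and rescaled. A standalone sketch:

   #include <stdio.h>

   #define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

   static unsigned
   max_anisotropy_field(float ratio)
   {
      return (unsigned)((CLAMP(ratio, 2.0f, 16.0f) - 2.0f) / 2.0f);
   }

   int
   main(void)
   {
      /* 1.0 and 2.0 both map to 0 (ratio 2:1), 16.0 maps to 7 (ratio 16:1). */
      printf("%u %u %u %u\n",
             max_anisotropy_field(1.0f), max_anisotropy_field(2.0f),
             max_anisotropy_field(4.0f), max_anisotropy_field(16.0f));
      return 0;
   }
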
+
+static const uint32_t vk_to_intel_mipmap_mode[] = {
+ [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
+ [VK_SAMPLER_MIPMAP_MODE_LINEAR] = MIPFILTER_LINEAR
+};
+
+static const uint32_t vk_to_intel_tex_address[] = {
+ [VK_SAMPLER_ADDRESS_MODE_REPEAT] = TCM_WRAP,
+ [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
+ [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = TCM_CLAMP,
+ [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
+ [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
+};
+
+/* Vulkan specifies the result of shadow comparisons as:
+ * 1 if ref <op> texel,
+ * 0 otherwise.
+ *
+ * The hardware does:
+ * 0 if texel <op> ref,
+ * 1 otherwise.
+ *
+ * So, these look a bit strange because there's both a negation
+ * and swapping of the arguments involved.
+ */
+static const uint32_t vk_to_intel_shadow_compare_op[] = {
+ [VK_COMPARE_OP_NEVER] = PREFILTEROP_ALWAYS,
+ [VK_COMPARE_OP_LESS] = PREFILTEROP_LEQUAL,
+ [VK_COMPARE_OP_EQUAL] = PREFILTEROP_NOTEQUAL,
+ [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LESS,
+ [VK_COMPARE_OP_GREATER] = PREFILTEROP_GEQUAL,
+ [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_EQUAL,
+ [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
+ [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_NEVER,
+};
+
+static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
+ [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE] = STD_FILTER,
+ [VK_SAMPLER_REDUCTION_MODE_MIN] = MINIMUM,
+ [VK_SAMPLER_REDUCTION_MODE_MAX] = MAXIMUM,
+};
+
+VkResult genX(CreateSampler)(
+ VkDevice _device,
+ const VkSamplerCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkSampler* pSampler)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ struct anv_sampler *sampler;
+
+ sampler = vk_sampler_create(&device->vk, pCreateInfo,
+ pAllocator, sizeof(*sampler));
+ if (!sampler)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ const struct vk_format_ycbcr_info *ycbcr_info =
+ sampler->vk.format != VK_FORMAT_UNDEFINED ?
+ vk_format_get_ycbcr_info(sampler->vk.format) : NULL;
+ assert((ycbcr_info == NULL) == (sampler->vk.ycbcr_conversion == NULL));
+
+ sampler->n_planes = ycbcr_info ? ycbcr_info->n_planes : 1;
+
+ uint32_t border_color_stride = 64;
+ uint32_t border_color_offset, border_color_db_offset = 0;
+ void *border_color_ptr;
+ if (sampler->vk.border_color <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
+ border_color_offset = device->border_colors.offset +
+ pCreateInfo->borderColor *
+ border_color_stride;
+ border_color_db_offset = device->border_colors_db.offset +
+ pCreateInfo->borderColor *
+ border_color_stride;
+ border_color_ptr = device->border_colors.map +
+ pCreateInfo->borderColor * border_color_stride;
+ } else {
+ assert(vk_border_color_is_custom(sampler->vk.border_color));
+ sampler->custom_border_color =
+ anv_state_reserved_pool_alloc(&device->custom_border_colors);
+ border_color_offset = sampler->custom_border_color.offset;
+ border_color_ptr = sampler->custom_border_color.map;
+
+ union isl_color_value color = { .u32 = {
+ sampler->vk.border_color_value.uint32[0],
+ sampler->vk.border_color_value.uint32[1],
+ sampler->vk.border_color_value.uint32[2],
+ sampler->vk.border_color_value.uint32[3],
+ } };
+
+ const struct anv_format *format_desc =
+ sampler->vk.format != VK_FORMAT_UNDEFINED ?
+ anv_get_format(sampler->vk.format) : NULL;
+
+ if (format_desc && format_desc->n_planes == 1 &&
+ !isl_swizzle_is_identity(format_desc->planes[0].swizzle)) {
+ const struct anv_format_plane *fmt_plane = &format_desc->planes[0];
+
+ assert(!isl_format_has_int_channel(fmt_plane->isl_format));
+ color = isl_color_value_swizzle(color, fmt_plane->swizzle, true);
+ }
+
+ memcpy(border_color_ptr, color.u32, sizeof(color));
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ if (pCreateInfo->flags & VK_SAMPLER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT) {
+ const VkOpaqueCaptureDescriptorDataCreateInfoEXT *opaque_info =
+ vk_find_struct_const(pCreateInfo->pNext,
+ OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT);
+ if (opaque_info) {
+ uint32_t alloc_idx = *((const uint32_t *)opaque_info->opaqueCaptureDescriptorData);
+ sampler->custom_border_color_db =
+ anv_state_reserved_array_pool_alloc_index(&device->custom_border_colors_db, alloc_idx);
+ } else {
+ sampler->custom_border_color_db =
+ anv_state_reserved_array_pool_alloc(&device->custom_border_colors_db, true);
+ }
+ } else {
+ sampler->custom_border_color_db =
+ anv_state_reserved_array_pool_alloc(&device->custom_border_colors_db, false);
+ }
+ if (sampler->custom_border_color_db.alloc_size == 0)
+ return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ border_color_db_offset = sampler->custom_border_color_db.offset;
+ memcpy(sampler->custom_border_color_db.map, color.u32, sizeof(color));
+ }
+ }
+
+ const bool seamless_cube =
+ !(pCreateInfo->flags & VK_SAMPLER_CREATE_NON_SEAMLESS_CUBE_MAP_BIT_EXT);
+
+ struct mesa_sha1 ctx;
+ _mesa_sha1_init(&ctx);
+
+ for (unsigned p = 0; p < sampler->n_planes; p++) {
+ const bool plane_has_chroma =
+ ycbcr_info && ycbcr_info->planes[p].has_chroma;
+ const VkFilter min_filter =
+ plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
+ pCreateInfo->minFilter;
+ const VkFilter mag_filter =
+ plane_has_chroma ? sampler->vk.ycbcr_conversion->state.chroma_filter :
+ pCreateInfo->magFilter;
+ const bool force_addr_rounding =
+ device->physical->instance->force_filter_addr_rounding;
+ const bool enable_min_filter_addr_rounding =
+ force_addr_rounding || min_filter != VK_FILTER_NEAREST;
+ const bool enable_mag_filter_addr_rounding =
+ force_addr_rounding || mag_filter != VK_FILTER_NEAREST;
+ /* From Broadwell PRM, SAMPLER_STATE:
+ * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
+ */
+ enum isl_format plane0_isl_format = sampler->vk.ycbcr_conversion ?
+ anv_get_format(sampler->vk.format)->planes[0].isl_format :
+ ISL_FORMAT_UNSUPPORTED;
+ const bool isl_format_is_planar_yuv =
+ plane0_isl_format != ISL_FORMAT_UNSUPPORTED &&
+ isl_format_is_yuv(plane0_isl_format) &&
+ isl_format_is_planar(plane0_isl_format);
+
+ const uint32_t mip_filter_mode =
+ isl_format_is_planar_yuv ?
+ MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];
+
+ struct GENX(SAMPLER_STATE) sampler_state = {
+ .SamplerDisable = false,
+ .TextureBorderColorMode = DX10OGL,
+
+#if GFX_VER >= 11
+ .CPSLODCompensationEnable = true,
+#endif
+
+ .LODPreClampMode = CLAMP_MODE_OGL,
+
+ .MipModeFilter = mip_filter_mode,
+ .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
+ .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
+ .TextureLODBias = CLAMP(pCreateInfo->mipLodBias, -16, 15.996),
+ .AnisotropicAlgorithm =
+ pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
+ .MinLOD = CLAMP(pCreateInfo->minLod, 0, 14),
+ .MaxLOD = CLAMP(pCreateInfo->maxLod, 0, 14),
+ .ChromaKeyEnable = 0,
+ .ChromaKeyIndex = 0,
+ .ChromaKeyMode = 0,
+ .ShadowFunction =
+ vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
+ pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
+ .CubeSurfaceControlMode = seamless_cube ? OVERRIDE : PROGRAMMED,
+
+ .LODClampMagnificationMode = MIPNONE,
+
+ .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
+ .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
+ .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
+ .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
+ .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
+ .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
+ .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
+ .TrilinearFilterQuality = 0,
+ .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
+ .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
+ .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
+ .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],
+
+ .ReductionType =
+ vk_to_intel_sampler_reduction_mode[sampler->vk.reduction_mode],
+ .ReductionTypeEnable =
+ sampler->vk.reduction_mode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE,
+ };
+
+ /* Pack a version of the SAMPLER_STATE without the border color. We'll
+ * use it to store into the shader cache and also for hashing.
+ */
+ GENX(SAMPLER_STATE_pack)(NULL, sampler->state_no_bc[p], &sampler_state);
+ _mesa_sha1_update(&ctx, sampler->state_no_bc[p], sizeof(sampler->state_no_bc[p]));
+
+      /* Put the border color in after the hashing; we don't want the
+       * allocation order of border colors to influence the hash. We just
+       * need the parameters to be hashed.
+ */
+ sampler_state.BorderColorPointer = border_color_offset;
+ GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ sampler_state.BorderColorPointer = border_color_db_offset;
+ GENX(SAMPLER_STATE_pack)(NULL, sampler->db_state[p], &sampler_state);
+ }
+ }
+
+ /* If we have bindless, allocate enough samplers. We allocate 32 bytes
+ * for each sampler instead of 16 bytes because we want all bindless
+ * samplers to be 32-byte aligned so we don't have to use indirect
+ * sampler messages on them.
+ */
+ sampler->bindless_state =
+ anv_state_pool_alloc(&device->dynamic_state_pool,
+ sampler->n_planes * 32, 32);
+ if (sampler->bindless_state.map) {
+ memcpy(sampler->bindless_state.map, sampler->state,
+ sampler->n_planes * GENX(SAMPLER_STATE_length) * 4);
+ }
+
+ /* Hash the border color */
+ _mesa_sha1_update(&ctx, border_color_ptr,
+ sizeof(union isl_color_value));
+
+ _mesa_sha1_final(&ctx, sampler->sha1);
+
+ *pSampler = anv_sampler_to_handle(sampler);
+
+ return VK_SUCCESS;
+}
+
+void
+genX(emit_embedded_sampler)(struct anv_device *device,
+ struct anv_embedded_sampler *sampler,
+ struct anv_pipeline_embedded_sampler_binding *binding)
+{
+ sampler->ref_cnt = 1;
+ memcpy(&sampler->key, &binding->key, sizeof(binding->key));
+
+ sampler->border_color_state =
+ anv_state_pool_alloc(&device->dynamic_state_db_pool,
+ sizeof(struct gfx8_border_color), 64);
+ memcpy(sampler->border_color_state.map,
+ binding->key.color,
+ sizeof(binding->key.color));
+
+ sampler->sampler_state =
+ anv_state_pool_alloc(&device->dynamic_state_db_pool,
+ ANV_SAMPLER_STATE_SIZE, 32);
+
+ struct GENX(SAMPLER_STATE) sampler_state = {
+ .BorderColorPointer = sampler->border_color_state.offset,
+ };
+ uint32_t dwords[GENX(SAMPLER_STATE_length)];
+ GENX(SAMPLER_STATE_pack)(NULL, dwords, &sampler_state);
+
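+ /* The embedded sampler key carries pre-packed SAMPLER_STATE dwords, so OR
+ * in the dwords packed above to merge the allocation-dependent border
+ * color pointer into the final state.
+ */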
+ for (uint32_t i = 0; i < GENX(SAMPLER_STATE_length); i++) {
+ ((uint32_t *)sampler->sampler_state.map)[i] =
+ dwords[i] | binding->key.sampler[i];
+ }
+}
+
+/* Wa_14015814527
+ *
+ * If a task shader was used within this cmd_buffer, commit empty URB
+ * states and a null primitive.
+ */
+void
+genX(apply_task_urb_workaround)(struct anv_cmd_buffer *cmd_buffer)
+{
+ if (!anv_cmd_buffer_is_render_queue(cmd_buffer))
+ return;
+
+#if GFX_VERx10 >= 125
+ const struct intel_device_info *devinfo = &cmd_buffer->device->physical->info;
+
+ if (!intel_needs_workaround(devinfo, 16014390852))
+ return;
+
+ if (cmd_buffer->state.current_pipeline != _3D ||
+ !cmd_buffer->state.gfx.used_task_shader)
+ return;
+
+ cmd_buffer->state.gfx.used_task_shader = false;
+
+ /* Wa_14015821291 mentions that the WA below is not required if a
+ * pipeline flush is already pending. It will get flushed during
+ * cmd_buffer_flush_state before the draw.
+ */
+ if ((cmd_buffer->state.pending_pipe_bits & ANV_PIPE_CS_STALL_BIT))
+ return;
+
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ }
+ }
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+ anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+
+ /* Issue 'nullprim' to commit the state. */
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ WriteImmediateData, cmd_buffer->device->workaround_address, 0, 0);
+#endif
+}
+
+VkResult
+genX(init_trtt_context_state)(struct anv_queue *queue)
+{
+#if GFX_VER >= 12
+ struct anv_device *device = queue->device;
+ struct anv_trtt *trtt = &device->trtt;
+
+ uint32_t cmds[128];
+ struct anv_batch batch = {
+ .start = cmds,
+ .next = cmds,
+ .end = (void *)cmds + sizeof(cmds),
+ };
+
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_INVAL), trtt_inval) {
+ trtt_inval.InvalidTileDetectionValue = ANV_TRTT_L1_INVALID_TILE_VAL;
+ }
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_NULL), trtt_null) {
+ trtt_null.NullTileDetectionValue = ANV_TRTT_L1_NULL_TILE_VAL;
+ }
+#if GFX_VER >= 20
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
+ trtt_va_range.TRVABase = device->physical->va.trtt.addr >> 44;
+ }
+#else
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_VA_RANGE), trtt_va_range) {
+ trtt_va_range.TRVAMaskValue = 0xF;
+ trtt_va_range.TRVADataValue = 0xF;
+ }
+#endif
+
+ uint64_t l3_addr = trtt->l3_addr;
+ assert((l3_addr & 0xFFF) == 0);
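+ /* Split the 4KB-aligned L3 table address across the two registers:
+ * bits 31:12 go in the LOW register, bits 47:32 in the HIGH register.
+ */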
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_LOW), trtt_base_low) {
+ trtt_base_low.TRVAL3PointerLowerAddress =
+ (l3_addr & 0xFFFFF000) >> 12;
+ }
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_L3_BASE_HIGH),
+ trtt_base_high) {
+ trtt_base_high.TRVAL3PointerUpperAddress =
+ (l3_addr >> 32) & 0xFFFF;
+ }
+ /* Enabling TR-TT needs to be done after setting up the other registers.
+ */
+ anv_batch_write_reg(&batch, GENX(GFX_TRTT_CR), trtt_cr) {
+ trtt_cr.TRTTEnable = true;
+ }
+
+ anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+ assert(batch.next <= batch.end);
+
+ VkResult res = anv_queue_submit_simple_batch(queue, &batch, false);
+ if (res != VK_SUCCESS)
+ return res;
+
+#endif
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/genX_internal_kernels.c b/src/intel/vulkan/genX_internal_kernels.c
new file mode 100644
index 00000000000..a476e2bcd04
--- /dev/null
+++ b/src/intel/vulkan/genX_internal_kernels.c
@@ -0,0 +1,111 @@
+/* Copyright © 2023 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "anv_private.h"
+#include "anv_internal_kernels.h"
+
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_serialize.h"
+
+#if GFX_VERx10 == 90
+# include "intel_gfx9_shaders_code.h"
+#elif GFX_VERx10 == 110
+# include "intel_gfx11_shaders_code.h"
+#elif GFX_VERx10 == 120
+# include "intel_gfx12_shaders_code.h"
+#elif GFX_VERx10 == 125
+# include "intel_gfx125_shaders_code.h"
+#elif GFX_VERx10 == 200
+# include "intel_gfx20_shaders_code.h"
+#else
+# error "Unsupported generation"
+#endif
+
+#include "genxml/gen_macros.h"
+
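+/* Load an internal-kernel parameter from the inline uniform data, using the
+ * field's byte offset within the parameter struct as the uniform offset.
+ */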
+#define load_param(b, bit_size, struct_name, field_name) \
+ nir_load_uniform(b, 1, bit_size, nir_imm_int(b, 0), \
+ .base = offsetof(struct_name, field_name), \
+ .range = bit_size / 8)
+
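+/* Flatten the fragment coordinate into a linear item index, assuming a
+ * maximum surface width of 8192 pixels per row.
+ */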
+static nir_def *
+load_fragment_index(nir_builder *b)
+{
+ nir_def *pos_in = nir_f2i32(b, nir_trim_vector(b, nir_load_frag_coord(b), 2));
+ return nir_iadd(b,
+ nir_imul_imm(b, nir_channel(b, pos_in, 1), 8192),
+ nir_channel(b, pos_in, 0));
+}
+
+static nir_def *
+load_compute_index(nir_builder *b)
+{
+ return nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
+}
+
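+/* Deserialize the precompiled internal shader library ("libanv") NIR blob
+ * selected above for this graphics generation.
+ */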
+nir_shader *
+genX(load_libanv_shader)(struct anv_device *device, void *mem_ctx)
+{
+ const nir_shader_compiler_options *nir_options =
+ device->physical->compiler->nir_options[MESA_SHADER_KERNEL];
+
+ struct blob_reader blob;
+ blob_reader_init(&blob, (void *)genX(intel_shaders_nir),
+ sizeof(genX(intel_shaders_nir)));
+ return nir_deserialize(mem_ctx, nir_options, &blob);
+}
+
+uint32_t
+genX(call_internal_shader)(nir_builder *b, enum anv_internal_kernel_name shader_name)
+{
+ switch (shader_name) {
+ case ANV_INTERNAL_KERNEL_GENERATED_DRAWS:
+ genX(libanv_write_draw)(
+ b,
+ load_param(b, 64, struct anv_gen_indirect_params, generated_cmds_addr),
+ load_param(b, 64, struct anv_gen_indirect_params, indirect_data_addr),
+ load_param(b, 64, struct anv_gen_indirect_params, draw_id_addr),
+ load_param(b, 32, struct anv_gen_indirect_params, indirect_data_stride),
+ load_param(b, 64, struct anv_gen_indirect_params, draw_count_addr),
+ load_param(b, 32, struct anv_gen_indirect_params, draw_base),
+ load_param(b, 32, struct anv_gen_indirect_params, instance_multiplier),
+ load_param(b, 32, struct anv_gen_indirect_params, max_draw_count),
+ load_param(b, 32, struct anv_gen_indirect_params, flags),
+ load_param(b, 32, struct anv_gen_indirect_params, ring_count),
+ load_param(b, 64, struct anv_gen_indirect_params, gen_addr),
+ load_param(b, 64, struct anv_gen_indirect_params, end_addr),
+ load_fragment_index(b));
+ return sizeof(struct anv_gen_indirect_params);
+
+ case ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE:
+ case ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT:
+ genX(libanv_query_copy)(
+ b,
+ load_param(b, 64, struct anv_query_copy_params, destination_addr),
+ load_param(b, 32, struct anv_query_copy_params, destination_stride),
+ load_param(b, 64, struct anv_query_copy_params, query_data_addr),
+ load_param(b, 32, struct anv_query_copy_params, query_base),
+ load_param(b, 32, struct anv_query_copy_params, num_queries),
+ load_param(b, 32, struct anv_query_copy_params, query_data_offset),
+ load_param(b, 32, struct anv_query_copy_params, query_stride),
+ load_param(b, 32, struct anv_query_copy_params, num_items),
+ load_param(b, 32, struct anv_query_copy_params, flags),
+ shader_name == ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE ?
+ load_compute_index(b) : load_fragment_index(b));
+ return sizeof(struct anv_query_copy_params);
+
+ case ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE:
+ genX(libanv_memcpy)(
+ b,
+ load_param(b, 64, struct anv_memcpy_params, dst_addr),
+ load_param(b, 64, struct anv_memcpy_params, src_addr),
+ load_param(b, 32, struct anv_memcpy_params, num_dwords),
+ nir_imul_imm(b, load_compute_index(b), 4));
+ return sizeof(struct anv_memcpy_params);
+
+ default:
+ unreachable("Invalid shader name");
+ break;
+ }
+}
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index cb5605e8883..f667c8bacbd 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -25,13 +25,62 @@
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
-#include "genxml/gen_rt_pack.h"
+#include "genxml/genX_rt_pack.h"
+#include "common/intel_genX_state_brw.h"
#include "common/intel_l3_config.h"
#include "common/intel_sample_positions.h"
#include "nir/nir_xfb_info.h"
#include "vk_util.h"
#include "vk_format.h"
+#include "vk_log.h"
+#include "vk_render_pass.h"
+
+static inline struct anv_batch *
+anv_gfx_pipeline_add(struct anv_graphics_pipeline *pipeline,
+ struct anv_gfx_state_ptr *ptr,
+ uint32_t n_dwords)
+{
+ struct anv_batch *batch = &pipeline->base.base.batch;
+
+ assert(ptr->len == 0 ||
+ (batch->next - batch->start) / 4 == (ptr->offset + ptr->len));
+ if (ptr->len == 0)
+ ptr->offset = (batch->next - batch->start) / 4;
+ ptr->len += n_dwords;
+
+ return batch;
+}
+
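+/* Variants of anv_batch_emit()/anv_batch_emitn() that emit into the pipeline
+ * batch while recording the packet's dword offset and length in the given
+ * anv_gfx_state_ptr, so the packed state can be located again later.
+ */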
+#define anv_pipeline_emit(pipeline, state, cmd, name) \
+ for (struct cmd name = { __anv_cmd_header(cmd) }, \
+ *_dst = anv_batch_emit_dwords( \
+ anv_gfx_pipeline_add(pipeline, \
+ &(pipeline)->state, \
+ __anv_cmd_length(cmd)), \
+ __anv_cmd_length(cmd)); \
+ __builtin_expect(_dst != NULL, 1); \
+ ({ __anv_cmd_pack(cmd)(&(pipeline)->base.base.batch, \
+ _dst, &name); \
+ VG(VALGRIND_CHECK_MEM_IS_DEFINED(_dst, __anv_cmd_length(cmd) * 4)); \
+ _dst = NULL; \
+ }))
+
+#define anv_pipeline_emitn(pipeline, state, n, cmd, ...) ({ \
+ void *__dst = anv_batch_emit_dwords( \
+ anv_gfx_pipeline_add(pipeline, &(pipeline)->state, n), n); \
+ if (__dst) { \
+ struct cmd __template = { \
+ __anv_cmd_header(cmd), \
+ .DWordLength = n - __anv_cmd_length_bias(cmd), \
+ __VA_ARGS__ \
+ }; \
+ __anv_cmd_pack(cmd)(&pipeline->base.base.batch, \
+ __dst, &__template); \
+ } \
+ __dst; \
+ })
+
static uint32_t
vertex_element_comp_control(enum isl_format format, unsigned comp)
@@ -85,39 +134,23 @@ vertex_element_comp_control(enum isl_format format, unsigned comp)
}
}
-static void
-emit_vertex_input(struct anv_graphics_pipeline *pipeline,
- const VkPipelineVertexInputStateCreateInfo *info)
+void
+genX(emit_vertex_input)(struct anv_batch *batch,
+ uint32_t *vertex_element_dws,
+ struct anv_graphics_pipeline *pipeline,
+ const struct vk_vertex_input_state *vi,
+ bool emit_in_pipeline)
{
+ const struct anv_device *device = pipeline->base.base.device;
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
-
- /* Pull inputs_read out of the VS prog data */
const uint64_t inputs_read = vs_prog_data->inputs_read;
const uint64_t double_inputs_read =
vs_prog_data->double_inputs_read & inputs_read;
assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
- const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
- vs_prog_data->uses_instanceid ||
- vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance;
-
- uint32_t elem_count = __builtin_popcount(elements) -
- __builtin_popcount(elements_double) / 2;
- const uint32_t total_elems =
- MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid);
-
- uint32_t *p;
-
- const uint32_t num_dwords = 1 + total_elems * 2;
- p = anv_batch_emitn(&pipeline->base.batch, num_dwords,
- GENX(3DSTATE_VERTEX_ELEMENTS));
- if (!p)
- return;
-
- for (uint32_t i = 0; i < total_elems; i++) {
+ for (uint32_t i = 0; i < pipeline->vs_input_elements; i++) {
/* The SKL docs for VERTEX_ELEMENT_STATE say:
*
* "All elements must be valid from Element[0] to the last valid
@@ -142,94 +175,168 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
.Component2Control = VFCOMP_STORE_0,
.Component3Control = VFCOMP_STORE_0,
};
- GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element);
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
+ &vertex_element_dws[i * 2],
+ &element);
}
- for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
- const VkVertexInputAttributeDescription *desc =
- &info->pVertexAttributeDescriptions[i];
- enum isl_format format = anv_get_isl_format(&pipeline->base.device->info,
- desc->format,
+ u_foreach_bit(a, vi->attributes_valid) {
+ enum isl_format format = anv_get_isl_format(device->info,
+ vi->attributes[a].format,
VK_IMAGE_ASPECT_COLOR_BIT,
VK_IMAGE_TILING_LINEAR);
+ assume(format < ISL_NUM_FORMATS);
- assert(desc->binding < MAX_VBS);
+ uint32_t binding = vi->attributes[a].binding;
+ assert(binding < MAX_VBS);
- if ((elements & (1 << desc->location)) == 0)
+ if ((elements & (1 << a)) == 0)
continue; /* Binding unused */
uint32_t slot =
- __builtin_popcount(elements & ((1 << desc->location) - 1)) -
+ __builtin_popcount(elements & ((1 << a) - 1)) -
DIV_ROUND_UP(__builtin_popcount(elements_double &
- ((1 << desc->location) -1)), 2);
+ ((1 << a) -1)), 2);
struct GENX(VERTEX_ELEMENT_STATE) element = {
- .VertexBufferIndex = desc->binding,
+ .VertexBufferIndex = vi->attributes[a].binding,
.Valid = true,
.SourceElementFormat = format,
.EdgeFlagEnable = false,
- .SourceElementOffset = desc->offset,
+ .SourceElementOffset = vi->attributes[a].offset,
.Component0Control = vertex_element_comp_control(format, 0),
.Component1Control = vertex_element_comp_control(format, 1),
.Component2Control = vertex_element_comp_control(format, 2),
.Component3Control = vertex_element_comp_control(format, 3),
};
- GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
+ &vertex_element_dws[slot * 2],
+ &element);
-#if GFX_VER >= 8
/* On Broadwell and later, we have a separate VF_INSTANCING packet
* that controls instancing. On Haswell and prior, that's part of
* VERTEX_BUFFER_STATE which we emit later.
*/
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
- vfi.InstancingEnable = pipeline->vb[desc->binding].instanced;
- vfi.VertexElementIndex = slot;
- vfi.InstanceDataStepRate =
- pipeline->vb[desc->binding].instance_divisor;
+ if (emit_in_pipeline) {
+ anv_pipeline_emit(pipeline, final.vf_instancing, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ bool per_instance = vi->bindings[binding].input_rate ==
+ VK_VERTEX_INPUT_RATE_INSTANCE;
+ uint32_t divisor = vi->bindings[binding].divisor *
+ pipeline->instance_multiplier;
+
+ vfi.InstancingEnable = per_instance;
+ vfi.VertexElementIndex = slot;
+ vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+ }
+ } else {
+ anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ bool per_instance = vi->bindings[binding].input_rate ==
+ VK_VERTEX_INPUT_RATE_INSTANCE;
+ uint32_t divisor = vi->bindings[binding].divisor *
+ pipeline->instance_multiplier;
+
+ vfi.InstancingEnable = per_instance;
+ vfi.VertexElementIndex = slot;
+ vfi.InstanceDataStepRate = per_instance ? divisor : 1;
+ }
}
-#endif
}
+}
- const uint32_t id_slot = elem_count;
- if (needs_svgs_elem) {
- /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
- * "Within a VERTEX_ELEMENT_STATE structure, if a Component
- * Control field is set to something other than VFCOMP_STORE_SRC,
- * no higher-numbered Component Control fields may be set to
- * VFCOMP_STORE_SRC"
- *
- * This means, that if we have BaseInstance, we need BaseVertex as
- * well. Just do all or nothing.
- */
- uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
- vs_prog_data->uses_baseinstance) ?
- VFCOMP_STORE_SRC : VFCOMP_STORE_0;
+static void
+emit_vertex_input(struct anv_graphics_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state,
+ const struct vk_vertex_input_state *vi)
+{
+ /* Only pack the VERTEX_ELEMENT_STATE if not dynamic so we can just memcpy
+ * everything in gfx8_cmd_buffer.c
+ */
+ if (!BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_VI)) {
+ genX(emit_vertex_input)(NULL,
+ pipeline->vertex_input_data,
+ pipeline, vi, true /* emit_in_pipeline */);
+ }
- struct GENX(VERTEX_ELEMENT_STATE) element = {
- .VertexBufferIndex = ANV_SVGS_VB_INDEX,
- .Valid = true,
- .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
- .Component0Control = base_ctrl,
- .Component1Control = base_ctrl,
-#if GFX_VER >= 8
- .Component2Control = VFCOMP_STORE_0,
- .Component3Control = VFCOMP_STORE_0,
-#else
- .Component2Control = VFCOMP_STORE_VID,
- .Component3Control = VFCOMP_STORE_IID,
+ const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+ const bool needs_svgs_elem = pipeline->svgs_count > 1 ||
+ !vs_prog_data->uses_drawid;
+ const uint32_t id_slot = pipeline->vs_input_elements;
+ const uint32_t drawid_slot = id_slot + needs_svgs_elem;
+ if (pipeline->svgs_count > 0) {
+ assert(pipeline->vertex_input_elems >= pipeline->svgs_count);
+ uint32_t slot_offset =
+ pipeline->vertex_input_elems - pipeline->svgs_count;
+
+ if (needs_svgs_elem) {
+#if GFX_VER < 11
+ /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
+ * "Within a VERTEX_ELEMENT_STATE structure, if a Component
+ * Control field is set to something other than VFCOMP_STORE_SRC,
+ * no higher-numbered Component Control fields may be set to
+ * VFCOMP_STORE_SRC"
+ *
+ * This means, that if we have BaseInstance, we need BaseVertex as
+ * well. Just do all or nothing.
+ */
+ uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
+ vs_prog_data->uses_baseinstance) ?
+ VFCOMP_STORE_SRC : VFCOMP_STORE_0;
#endif
- };
- GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);
-#if GFX_VER >= 8
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
- vfi.VertexElementIndex = id_slot;
+ struct GENX(VERTEX_ELEMENT_STATE) element = {
+ .VertexBufferIndex = ANV_SVGS_VB_INDEX,
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
+#if GFX_VER >= 11
+ /* On gen11, these are taken care of by extra parameter slots */
+ .Component0Control = VFCOMP_STORE_0,
+ .Component1Control = VFCOMP_STORE_0,
+#else
+ .Component0Control = base_ctrl,
+ .Component1Control = base_ctrl,
+#endif
+ .Component2Control = VFCOMP_STORE_0,
+ .Component3Control = VFCOMP_STORE_0,
+ };
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
+ &pipeline->vertex_input_data[slot_offset * 2],
+ &element);
+ slot_offset++;
+
+ anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+ GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.VertexElementIndex = id_slot;
+ }
+ }
+
+ if (vs_prog_data->uses_drawid) {
+ struct GENX(VERTEX_ELEMENT_STATE) element = {
+ .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32_UINT,
+#if GFX_VER >= 11
+ /* On gen11, this is taken care of by extra parameter slots */
+ .Component0Control = VFCOMP_STORE_0,
+#else
+ .Component0Control = VFCOMP_STORE_SRC,
+#endif
+ .Component1Control = VFCOMP_STORE_0,
+ .Component2Control = VFCOMP_STORE_0,
+ .Component3Control = VFCOMP_STORE_0,
+ };
+ GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
+ &pipeline->vertex_input_data[slot_offset * 2],
+ &element);
+ slot_offset++;
+
+ anv_pipeline_emit(pipeline, final.vf_sgvs_instancing,
+ GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.VertexElementIndex = drawid_slot;
+ }
}
-#endif
}
-#if GFX_VER >= 8
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) {
+ anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs) {
sgvs.VertexIDEnable = vs_prog_data->uses_vertexid;
sgvs.VertexIDComponentNumber = 2;
sgvs.VertexIDElementOffset = id_slot;
@@ -237,93 +344,187 @@ emit_vertex_input(struct anv_graphics_pipeline *pipeline,
sgvs.InstanceIDComponentNumber = 3;
sgvs.InstanceIDElementOffset = id_slot;
}
-#endif
- const uint32_t drawid_slot = elem_count + needs_svgs_elem;
- if (vs_prog_data->uses_drawid) {
- struct GENX(VERTEX_ELEMENT_STATE) element = {
- .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
- .Valid = true,
- .SourceElementFormat = ISL_FORMAT_R32_UINT,
- .Component0Control = VFCOMP_STORE_SRC,
- .Component1Control = VFCOMP_STORE_0,
- .Component2Control = VFCOMP_STORE_0,
- .Component3Control = VFCOMP_STORE_0,
- };
- GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
- &p[1 + drawid_slot * 2],
- &element);
+#if GFX_VER >= 11
+ anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs) {
+ /* gl_BaseVertex */
+ sgvs.XP0Enable = vs_prog_data->uses_firstvertex;
+ sgvs.XP0SourceSelect = XP0_PARAMETER;
+ sgvs.XP0ComponentNumber = 0;
+ sgvs.XP0ElementOffset = id_slot;
-#if GFX_VER >= 8
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
- vfi.VertexElementIndex = drawid_slot;
- }
-#endif
+ /* gl_BaseInstance */
+ sgvs.XP1Enable = vs_prog_data->uses_baseinstance;
+ sgvs.XP1SourceSelect = StartingInstanceLocation;
+ sgvs.XP1ComponentNumber = 1;
+ sgvs.XP1ElementOffset = id_slot;
+
+ /* gl_DrawID */
+ sgvs.XP2Enable = vs_prog_data->uses_drawid;
+ sgvs.XP2ComponentNumber = 0;
+ sgvs.XP2ElementOffset = drawid_slot;
}
+#endif
}
void
genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
const struct intel_l3_config *l3_config,
VkShaderStageFlags active_stages,
- const unsigned entry_size[4],
+ const struct intel_urb_config *urb_cfg_in,
+ struct intel_urb_config *urb_cfg_out,
enum intel_urb_deref_block_size *deref_block_size)
{
- const struct intel_device_info *devinfo = &device->info;
+ const struct intel_device_info *devinfo = device->info;
- unsigned entries[4];
- unsigned start[4];
bool constrained;
intel_get_urb_config(devinfo, l3_config,
active_stages &
VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
- entry_size, entries, start, deref_block_size,
+ urb_cfg_out, deref_block_size,
&constrained);
-#if GFX_VERx10 == 70
- /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
- *
- * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
- * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
- * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
- * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL
- * needs to be sent before any combination of VS associated 3DSTATE."
- */
- anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) {
- pc.DepthStallEnable = true;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = device->workaround_address;
- }
+#if INTEL_NEEDS_WA_16014912113
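+ /* If the tess-eval URB allocation changed and a previous configuration
+ * exists, re-emit the previous URB configuration (256 VS entries, 0 for
+ * the other stages) and flush HDC before programming the new allocation.
+ */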
+ if (intel_urb_setup_changed(urb_cfg_in, urb_cfg_out,
+ MESA_SHADER_TESS_EVAL) && urb_cfg_in->size[0] != 0) {
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ urb.VSURBStartingAddress = urb_cfg_in->start[i];
+ urb.VSURBEntryAllocationSize = urb_cfg_in->size[i] - 1;
+ urb.VSNumberofURBEntries = i == 0 ? 256 : 0;
+ }
+ }
+ genx_batch_emit_pipe_control(batch, device->info, _3D,
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
+ }
#endif
for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
urb._3DCommandSubOpcode += i;
- urb.VSURBStartingAddress = start[i];
- urb.VSURBEntryAllocationSize = entry_size[i] - 1;
- urb.VSNumberofURBEntries = entries[i];
+ urb.VSURBStartingAddress = urb_cfg_out->start[i];
+ urb.VSURBEntryAllocationSize = urb_cfg_out->size[i] - 1;
+ urb.VSNumberofURBEntries = urb_cfg_out->entries[i];
+ }
+ }
+#if GFX_VERx10 >= 125
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+ anv_batch_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+ }
+#endif
+}
+
+#if GFX_VERx10 >= 125
+static void
+emit_urb_setup_mesh(struct anv_graphics_pipeline *pipeline,
+ enum intel_urb_deref_block_size *deref_block_size)
+{
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
+
+ const struct brw_task_prog_data *task_prog_data =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK) ?
+ get_task_prog_data(pipeline) : NULL;
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+
+ const struct intel_mesh_urb_allocation alloc =
+ intel_get_mesh_urb_config(devinfo, pipeline->base.base.l3_config,
+ task_prog_data ? task_prog_data->map.size_dw : 0,
+ mesh_prog_data->map.size_dw);
+
+ /* Zero out the primitive pipeline URB allocations. */
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
}
}
+
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), urb) {
+ if (task_prog_data) {
+ urb.TASKURBEntryAllocationSize = alloc.task_entry_size_64b - 1;
+ urb.TASKNumberofURBEntriesSlice0 = alloc.task_entries;
+ urb.TASKNumberofURBEntriesSliceN = alloc.task_entries;
+ urb.TASKURBStartingAddressSlice0 = alloc.task_starting_address_8kb;
+ urb.TASKURBStartingAddressSliceN = alloc.task_starting_address_8kb;
+ }
+ }
+
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), urb) {
+ urb.MESHURBEntryAllocationSize = alloc.mesh_entry_size_64b - 1;
+ urb.MESHNumberofURBEntriesSlice0 = alloc.mesh_entries;
+ urb.MESHNumberofURBEntriesSliceN = alloc.mesh_entries;
+ urb.MESHURBStartingAddressSlice0 = alloc.mesh_starting_address_8kb;
+ urb.MESHURBStartingAddressSliceN = alloc.mesh_starting_address_8kb;
+ }
+
+ *deref_block_size = alloc.deref_block_size;
}
+#endif
static void
emit_urb_setup(struct anv_graphics_pipeline *pipeline,
enum intel_urb_deref_block_size *deref_block_size)
{
- unsigned entry_size[4];
+#if GFX_VERx10 >= 125
+ if (anv_pipeline_is_mesh(pipeline)) {
+ emit_urb_setup_mesh(pipeline, deref_block_size);
+ return;
+ }
+#endif
for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
const struct brw_vue_prog_data *prog_data =
!anv_pipeline_has_stage(pipeline, i) ? NULL :
- (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;
+ (const struct brw_vue_prog_data *) pipeline->base.shaders[i]->prog_data;
- entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
+ pipeline->urb_cfg.size[i] = prog_data ? prog_data->urb_entry_size : 1;
}
- genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch,
- pipeline->base.l3_config,
- pipeline->active_stages, entry_size,
- deref_block_size);
+ struct anv_device *device = pipeline->base.base.device;
+ const struct intel_device_info *devinfo = device->info;
+
+ bool constrained;
+ intel_get_urb_config(devinfo,
+ pipeline->base.base.l3_config,
+ pipeline->base.base.active_stages &
+ VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
+ pipeline->base.base.active_stages &
+ VK_SHADER_STAGE_GEOMETRY_BIT,
+ &pipeline->urb_cfg, deref_block_size,
+ &constrained);
+
+ for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_VS), urb) {
+ urb._3DCommandSubOpcode += i;
+ urb.VSURBStartingAddress = pipeline->urb_cfg.start[i];
+ urb.VSURBEntryAllocationSize = pipeline->urb_cfg.size[i] - 1;
+ urb.VSNumberofURBEntries = pipeline->urb_cfg.entries[i];
+ }
+ }
+
+#if GFX_VERx10 >= 125
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_TASK), zero);
+ anv_pipeline_emit(pipeline, final.urb, GENX(3DSTATE_URB_ALLOC_MESH), zero);
+ }
+#endif
+}
+
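+/* Returns true when the fragment shader reads gl_PrimitiveID but the last
+ * pre-rasterization stage does not write it, in which case the SBE has to
+ * supply it.
+ */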
+static bool
+sbe_primitive_id_override(struct anv_graphics_pipeline *pipeline)
+{
+ const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ if (!wm_prog_data)
+ return false;
+
+ const struct intel_vue_map *fs_input_map =
+ &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
+
+ return (wm_prog_data->inputs & VARYING_BIT_PRIMITIVE_ID) &&
+ fs_input_map->varying_to_slot[VARYING_SLOT_PRIMITIVE_ID] == -1;
}
static void
@@ -332,117 +533,167 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe);
-#if GFX_VER >= 8
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe);
+ anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe);
+ anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), sbe);
+#if GFX_VERx10 >= 125
+ if (anv_pipeline_is_mesh(pipeline))
+ anv_pipeline_emit(pipeline, final.sbe_mesh, GENX(3DSTATE_SBE_MESH), sbe);
#endif
return;
}
- const struct brw_vue_map *fs_input_map =
- &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
-
- struct GENX(3DSTATE_SBE) sbe = {
- GENX(3DSTATE_SBE_header),
- .AttributeSwizzleEnable = true,
- .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
- .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
- .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
- };
-
-#if GFX_VER >= 9
- for (unsigned i = 0; i < 32; i++)
- sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
-#endif
-
-#if GFX_VER >= 8
- /* On Broadwell, they broke 3DSTATE_SBE into two packets */
- struct GENX(3DSTATE_SBE_SWIZ) swiz = {
- GENX(3DSTATE_SBE_SWIZ_header),
- };
-#else
-# define swiz sbe
-#endif
-
- int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
- fs_input_map);
- assert(first_slot % 2 == 0);
- unsigned urb_entry_read_offset = first_slot / 2;
- int max_source_attr = 0;
- for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
- uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
- int input_index = wm_prog_data->urb_setup[attr];
+ anv_pipeline_emit(pipeline, final.sbe, GENX(3DSTATE_SBE), sbe) {
+ anv_pipeline_emit(pipeline, final.sbe_swiz, GENX(3DSTATE_SBE_SWIZ), swiz) {
- assert(0 <= input_index);
-
- /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
- * VUE header
+ /* TODO(mesh): Figure out cases where we need attribute swizzling. See also
+ * calculate_urb_setup() and related functions.
*/
- if (attr == VARYING_SLOT_VIEWPORT ||
- attr == VARYING_SLOT_LAYER ||
- attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
- continue;
- }
-
- if (attr == VARYING_SLOT_PNTC) {
- sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
- continue;
- }
+ sbe.AttributeSwizzleEnable = anv_pipeline_is_primitive(pipeline);
+ sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+ sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+ sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+
+ for (unsigned i = 0; i < 32; i++)
+ sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+
+ if (anv_pipeline_is_primitive(pipeline)) {
+ const struct intel_vue_map *fs_input_map =
+ &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
+
+ int first_slot =
+ brw_compute_first_urb_slot_required(wm_prog_data->inputs,
+ fs_input_map);
+ assert(first_slot % 2 == 0);
+ unsigned urb_entry_read_offset = first_slot / 2;
+ int max_source_attr = 0;
+ for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
+ uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
+ int input_index = wm_prog_data->urb_setup[attr];
+
+ assert(0 <= input_index);
+
+ /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
+ * VUE header
+ */
+ if (attr == VARYING_SLOT_VIEWPORT ||
+ attr == VARYING_SLOT_LAYER ||
+ attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
+ continue;
+ }
+
+ if (attr == VARYING_SLOT_PNTC) {
+ sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
+ continue;
+ }
+
+ const int slot = fs_input_map->varying_to_slot[attr];
+
+ if (slot == -1) {
+ /* This attribute does not exist in the VUE--that means that
+ * the vertex shader did not write to it. It could be that it's
+ * a regular varying read by the fragment shader but not
+ * written by the vertex shader or it's gl_PrimitiveID. In the
+ * first case the value is undefined, in the second it needs to
+ * be gl_PrimitiveID.
+ */
+ swiz.Attribute[input_index].ConstantSource = PRIM_ID;
+ swiz.Attribute[input_index].ComponentOverrideX = true;
+ swiz.Attribute[input_index].ComponentOverrideY = true;
+ swiz.Attribute[input_index].ComponentOverrideZ = true;
+ swiz.Attribute[input_index].ComponentOverrideW = true;
+ continue;
+ }
+
+ /* We have to subtract two slots to account for the URB entry
+ * output read offset in the VS and GS stages.
+ */
+ const int source_attr = slot - 2 * urb_entry_read_offset;
+ assert(source_attr >= 0 && source_attr < 32);
+ max_source_attr = MAX2(max_source_attr, source_attr);
+ /* The hardware can only apply overrides to the first 16 attributes;
+ * the remaining (up to 16) have to be lined up so that the input
+ * index equals the output index. We'll need to do some tweaking to
+ * make sure that's the case.
+ */
+ if (input_index < 16)
+ swiz.Attribute[input_index].SourceAttribute = source_attr;
+ else
+ assert(source_attr == input_index);
+ }
- const int slot = fs_input_map->varying_to_slot[attr];
+ sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
+ sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
+ sbe.ForceVertexURBEntryReadOffset = true;
+ sbe.ForceVertexURBEntryReadLength = true;
- if (slot == -1) {
- /* This attribute does not exist in the VUE--that means that the
- * vertex shader did not write to it. It could be that it's a
- * regular varying read by the fragment shader but not written by
- * the vertex shader or it's gl_PrimitiveID. In the first case the
- * value is undefined, in the second it needs to be
- * gl_PrimitiveID.
+ /* Ask the hardware to supply PrimitiveID if the fragment shader
+ * reads it but a previous stage didn't write one.
*/
- swiz.Attribute[input_index].ConstantSource = PRIM_ID;
- swiz.Attribute[input_index].ComponentOverrideX = true;
- swiz.Attribute[input_index].ComponentOverrideY = true;
- swiz.Attribute[input_index].ComponentOverrideZ = true;
- swiz.Attribute[input_index].ComponentOverrideW = true;
- continue;
+ if (sbe_primitive_id_override(pipeline)) {
+ sbe.PrimitiveIDOverrideAttributeSelect =
+ wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_ID];
+ sbe.PrimitiveIDOverrideComponentX = true;
+ sbe.PrimitiveIDOverrideComponentY = true;
+ sbe.PrimitiveIDOverrideComponentZ = true;
+ sbe.PrimitiveIDOverrideComponentW = true;
+ }
+ } else {
+ assert(anv_pipeline_is_mesh(pipeline));
+#if GFX_VERx10 >= 125
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ anv_pipeline_emit(pipeline, final.sbe_mesh,
+ GENX(3DSTATE_SBE_MESH), sbe_mesh) {
+ const struct brw_mue_map *mue = &mesh_prog_data->map;
+
+ assert(mue->per_vertex_header_size_dw % 8 == 0);
+ sbe_mesh.PerVertexURBEntryOutputReadOffset = mue->per_vertex_header_size_dw / 8;
+ sbe_mesh.PerVertexURBEntryOutputReadLength = DIV_ROUND_UP(mue->per_vertex_data_size_dw, 8);
+
+ /* Clip distance array is passed in the per-vertex header so that
+ * it can be consumed by the HW. If the user wants to read it in the
+ * FS, adjust the offset and length to cover it. Conveniently it
+ * is at the end of the per-vertex header, right before per-vertex
+ * attributes.
+ *
+ * Note that FS attribute reading must be aware that the clip
+ * distances have fixed position.
+ */
+ if (mue->per_vertex_header_size_dw > 8 &&
+ (wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] >= 0 ||
+ wm_prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] >= 0)) {
+ sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+ sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+ }
+
+ if (mue->user_data_in_vertex_header) {
+ sbe_mesh.PerVertexURBEntryOutputReadOffset -= 1;
+ sbe_mesh.PerVertexURBEntryOutputReadLength += 1;
+ }
+
+ assert(mue->per_primitive_header_size_dw % 8 == 0);
+ sbe_mesh.PerPrimitiveURBEntryOutputReadOffset =
+ mue->per_primitive_header_size_dw / 8;
+ sbe_mesh.PerPrimitiveURBEntryOutputReadLength =
+ DIV_ROUND_UP(mue->per_primitive_data_size_dw, 8);
+
+ /* Just like with clip distances, if Primitive Shading Rate,
+ * Viewport Index or Layer is read back in the FS, adjust the
+ * offset and length to cover the Primitive Header, where PSR,
+ * Viewport Index & Layer are stored.
+ */
+ if (wm_prog_data->urb_setup[VARYING_SLOT_VIEWPORT] >= 0 ||
+ wm_prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] >= 0 ||
+ wm_prog_data->urb_setup[VARYING_SLOT_LAYER] >= 0 ||
+ mue->user_data_in_primitive_header) {
+ assert(sbe_mesh.PerPrimitiveURBEntryOutputReadOffset > 0);
+ sbe_mesh.PerPrimitiveURBEntryOutputReadOffset -= 1;
+ sbe_mesh.PerPrimitiveURBEntryOutputReadLength += 1;
+ }
+ }
+#endif
}
-
- /* We have to subtract two slots to accout for the URB entry output
- * read offset in the VS and GS stages.
- */
- const int source_attr = slot - 2 * urb_entry_read_offset;
- assert(source_attr >= 0 && source_attr < 32);
- max_source_attr = MAX2(max_source_attr, source_attr);
- /* The hardware can only do overrides on 16 overrides at a time, and the
- * other up to 16 have to be lined up so that the input index = the
- * output index. We'll need to do some tweaking to make sure that's the
- * case.
- */
- if (input_index < 16)
- swiz.Attribute[input_index].SourceAttribute = source_attr;
- else
- assert(source_attr == input_index);
}
-
- sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
- sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
-#if GFX_VER >= 8
- sbe.ForceVertexURBEntryReadOffset = true;
- sbe.ForceVertexURBEntryReadLength = true;
-#endif
-
- uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch,
- GENX(3DSTATE_SBE_length));
- if (!dw)
- return;
- GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe);
-
-#if GFX_VER >= 8
- dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length));
- if (!dw)
- return;
- GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz);
-#endif
+ }
}
/** Returns the final polygon mode for rasterization
@@ -451,10 +702,22 @@ emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
* different shader stages which might generate their own type of primitives.
*/
VkPolygonMode
-genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
+genX(raster_polygon_mode)(const struct anv_graphics_pipeline *pipeline,
+ VkPolygonMode polygon_mode,
VkPrimitiveTopology primitive_topology)
{
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
+ if (anv_pipeline_is_mesh(pipeline)) {
+ switch (get_mesh_prog_data(pipeline)->primitive_type) {
+ case MESA_PRIM_POINTS:
+ return VK_POLYGON_MODE_POINT;
+ case MESA_PRIM_LINES:
+ return VK_POLYGON_MODE_LINE;
+ case MESA_PRIM_TRIANGLES:
+ return polygon_mode;
+ default:
+ unreachable("invalid primitive type for mesh");
+ }
+ } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
switch (get_gs_prog_data(pipeline)->output_topology) {
case _3DPRIM_POINTLIST:
return VK_POLYGON_MODE_POINT;
@@ -471,20 +734,20 @@ genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
case _3DPRIM_QUADLIST:
case _3DPRIM_QUADSTRIP:
case _3DPRIM_POLYGON:
- return pipeline->polygon_mode;
+ return polygon_mode;
}
unreachable("Unsupported GS output topology");
} else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
switch (get_tes_prog_data(pipeline)->output_topology) {
- case BRW_TESS_OUTPUT_TOPOLOGY_POINT:
+ case INTEL_TESS_OUTPUT_TOPOLOGY_POINT:
return VK_POLYGON_MODE_POINT;
- case BRW_TESS_OUTPUT_TOPOLOGY_LINE:
+ case INTEL_TESS_OUTPUT_TOPOLOGY_LINE:
return VK_POLYGON_MODE_LINE;
- case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW:
- case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
- return pipeline->polygon_mode;
+ case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW:
+ case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
+ return polygon_mode;
}
unreachable("Unsupported TCS output topology");
} else {
@@ -503,7 +766,7 @@ genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
- return pipeline->polygon_mode;
+ return polygon_mode;
default:
unreachable("Unsupported primitive topology");
@@ -511,42 +774,6 @@ genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
}
}
-uint32_t
-genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
- VkPolygonMode raster_mode)
-{
-#if GFX_VER <= 7
- if (raster_mode == VK_POLYGON_MODE_LINE) {
- switch (pipeline->line_mode) {
- case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
- return MSRASTMODE_ON_PATTERN;
-
- case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
- case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
- return MSRASTMODE_OFF_PIXEL;
-
- default:
- unreachable("Unsupported line rasterization mode");
- }
- } else {
- return pipeline->rasterization_samples > 1 ?
- MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
- }
-#else
- unreachable("Only on gen7");
-#endif
-}
-
-static VkProvokingVertexModeEXT
-vk_provoking_vertex_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
-{
- const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
- vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
-
- return rs_pv_info == NULL ? VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT :
- rs_pv_info->provokingVertexMode;
-}
-
const uint32_t genX(vk_to_intel_cullmode)[] = {
[VK_CULL_MODE_NONE] = CULLMODE_NONE,
[VK_CULL_MODE_FRONT_BIT] = CULLMODE_FRONT,
@@ -565,302 +792,72 @@ const uint32_t genX(vk_to_intel_front_face)[] = {
[VK_FRONT_FACE_CLOCKWISE] = 0
};
-#if GFX_VER >= 9
-static VkConservativeRasterizationModeEXT
-vk_conservative_rasterization_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
-{
- const VkPipelineRasterizationConservativeStateCreateInfoEXT *cr =
- vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);
-
- return cr ? cr->conservativeRasterizationMode :
- VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
-}
-#endif
-
-void
-genX(rasterization_mode)(VkPolygonMode raster_mode,
- VkLineRasterizationModeEXT line_mode,
- float line_width,
- uint32_t *api_mode,
- bool *msaa_rasterization_enable)
-{
-#if GFX_VER >= 8
- if (raster_mode == VK_POLYGON_MODE_LINE) {
- /* Unfortunately, configuring our line rasterization hardware on gfx8
- * and later is rather painful. Instead of giving us bits to tell the
- * hardware what line mode to use like we had on gfx7, we now have an
- * arcane combination of API Mode and MSAA enable bits which do things
- * in a table which are expected to magically put the hardware into the
- * right mode for your API. Sadly, Vulkan isn't any of the APIs the
- * hardware people thought of so nothing works the way you want it to.
- *
- * Look at the table titled "Multisample Rasterization Modes" in Vol 7
- * of the Skylake PRM for more details.
- */
- switch (line_mode) {
- case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
- *api_mode = DX100;
-#if GFX_VER <= 9
- /* Prior to ICL, the algorithm the HW uses to draw wide lines
- * doesn't quite match what the CTS expects, at least for rectangular
- * lines, so we set this to false here, making it draw parallelograms
- * instead, which work well enough.
- */
- *msaa_rasterization_enable = line_width < 1.0078125;
-#else
- *msaa_rasterization_enable = true;
-#endif
- break;
-
- case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
- case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
- *api_mode = DX9OGL;
- *msaa_rasterization_enable = false;
- break;
-
- default:
- unreachable("Unsupported line rasterization mode");
- }
- } else {
- *api_mode = DX100;
- *msaa_rasterization_enable = true;
- }
-#else
- unreachable("Invalid call");
-#endif
-}
-
static void
emit_rs_state(struct anv_graphics_pipeline *pipeline,
- const VkPipelineInputAssemblyStateCreateInfo *ia_info,
- const VkPipelineRasterizationStateCreateInfo *rs_info,
- const VkPipelineMultisampleStateCreateInfo *ms_info,
- const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
- const uint32_t dynamic_states,
- const struct anv_render_pass *pass,
- const struct anv_subpass *subpass,
+ const struct vk_input_assembly_state *ia,
+ const struct vk_rasterization_state *rs,
+ const struct vk_multisample_state *ms,
+ const struct vk_render_pass_state *rp,
enum intel_urb_deref_block_size urb_deref_block_size)
{
- struct GENX(3DSTATE_SF) sf = {
- GENX(3DSTATE_SF_header),
- };
-
- sf.ViewportTransformEnable = true;
- sf.StatisticsEnable = true;
- sf.VertexSubPixelPrecisionSelect = _8Bit;
- sf.AALineDistanceMode = true;
-
- switch (vk_provoking_vertex_mode(rs_info)) {
- case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
- sf.TriangleStripListProvokingVertexSelect = 0;
- sf.LineStripListProvokingVertexSelect = 0;
- sf.TriangleFanProvokingVertexSelect = 1;
- break;
-
- case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
- sf.TriangleStripListProvokingVertexSelect = 2;
- sf.LineStripListProvokingVertexSelect = 1;
- sf.TriangleFanProvokingVertexSelect = 2;
- break;
-
- default:
- unreachable("Invalid provoking vertex mode");
- }
-
-#if GFX_VERx10 == 75
- sf.LineStippleEnable = line_info && line_info->stippledLineEnable;
-#endif
+ anv_pipeline_emit(pipeline, partial.sf, GENX(3DSTATE_SF), sf) {
+ sf.ViewportTransformEnable = true;
+ sf.StatisticsEnable = true;
+ sf.VertexSubPixelPrecisionSelect = _8Bit;
+ sf.AALineDistanceMode = true;
#if GFX_VER >= 12
- sf.DerefBlockSize = urb_deref_block_size;
+ sf.DerefBlockSize = urb_deref_block_size;
#endif
- const struct brw_vue_prog_data *last_vue_prog_data =
- anv_pipeline_get_last_vue_prog_data(pipeline);
+ bool point_from_shader;
+ if (anv_pipeline_is_primitive(pipeline)) {
+ const struct brw_vue_prog_data *last_vue_prog_data =
+ anv_pipeline_get_last_vue_prog_data(pipeline);
+ point_from_shader = last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ;
+ } else {
+ assert(anv_pipeline_is_mesh(pipeline));
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ point_from_shader = mesh_prog_data->map.start_dw[VARYING_SLOT_PSIZ] >= 0;
+ }
- if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
- sf.PointWidthSource = Vertex;
- } else {
- sf.PointWidthSource = State;
- sf.PointWidth = 1.0;
+ if (point_from_shader) {
+ sf.PointWidthSource = Vertex;
+ } else {
+ sf.PointWidthSource = State;
+ sf.PointWidth = 1.0;
+ }
}
-#if GFX_VER >= 8
- struct GENX(3DSTATE_RASTER) raster = {
- GENX(3DSTATE_RASTER_header),
- };
-#else
-# define raster sf
-#endif
-
- VkPolygonMode raster_mode =
- genX(raster_polygon_mode)(pipeline, ia_info->topology);
- bool dynamic_primitive_topology =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
-
- /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
- * "Multisample Modes State".
- */
-#if GFX_VER >= 8
- if (!dynamic_primitive_topology)
- genX(rasterization_mode)(raster_mode, pipeline->line_mode,
- rs_info->lineWidth,
- &raster.APIMode,
- &raster.DXMultisampleRasterizationEnable);
-
- /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
- * computations. If we ever set this bit to a different value, they will
- * need to be updated accordingly.
- */
- raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
- raster.ForceMultisampling = false;
-#else
- uint32_t ms_rast_mode = 0;
-
- if (!dynamic_primitive_topology)
- ms_rast_mode = genX(ms_rasterization_mode)(pipeline, raster_mode);
-
- raster.MultisampleRasterizationMode = ms_rast_mode;
-#endif
-
- raster.AntialiasingEnable =
- dynamic_primitive_topology ? 0 :
- anv_rasterization_aa_mode(raster_mode, pipeline->line_mode);
-
- raster.FrontWinding =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE ?
- 0 : genX(vk_to_intel_front_face)[rs_info->frontFace];
- raster.CullMode =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE ?
- 0 : genX(vk_to_intel_cullmode)[rs_info->cullMode];
-
- raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
- raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
- raster.ScissorRectangleEnable = true;
-
-#if GFX_VER >= 9
- /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */
- raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
- raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
-#elif GFX_VER >= 8
- raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
-#endif
-
-#if GFX_VER >= 9
- raster.ConservativeRasterizationEnable =
- vk_conservative_rasterization_mode(rs_info) !=
- VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
-#endif
-
- bool depth_bias_enable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE ?
- 0 : rs_info->depthBiasEnable;
-
- raster.GlobalDepthOffsetEnableSolid = depth_bias_enable;
- raster.GlobalDepthOffsetEnableWireframe = depth_bias_enable;
- raster.GlobalDepthOffsetEnablePoint = depth_bias_enable;
+ anv_pipeline_emit(pipeline, partial.raster, GENX(3DSTATE_RASTER), raster) {
+ /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
+ * "Multisample Modes State".
+ */
+ /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
+ * computations. If we ever set this bit to a different value, they will
+ * need to be updated accordingly.
+ */
+ raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
+ raster.ForceMultisampling = false;
-#if GFX_VER == 7
- /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it
- * can get the depth offsets correct.
- */
- if (subpass->depth_stencil_attachment) {
- VkFormat vk_format =
- pass->attachments[subpass->depth_stencil_attachment->attachment].format;
- assert(vk_format_is_depth_or_stencil(vk_format));
- if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
- enum isl_format isl_format =
- anv_get_isl_format(&pipeline->base.device->info, vk_format,
- VK_IMAGE_ASPECT_DEPTH_BIT,
- VK_IMAGE_TILING_OPTIMAL);
- sf.DepthBufferSurfaceFormat =
- isl_format_get_depth_format(isl_format, false);
- }
+ raster.ScissorRectangleEnable = true;
}
-#endif
-
-#if GFX_VER >= 8
- GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf);
- GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster);
-#else
-# undef raster
- GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf);
-#endif
}
static void
emit_ms_state(struct anv_graphics_pipeline *pipeline,
- const VkPipelineMultisampleStateCreateInfo *info,
- uint32_t dynamic_states)
+ const struct vk_multisample_state *ms)
{
- /* Only lookup locations if the extensions is active, otherwise the default
- * ones will be used either at device initialization time or through
- * 3DSTATE_MULTISAMPLE on Gfx7/7.5 by passing NULL locations.
- */
- if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations) {
- /* If the sample locations are dynamic, 3DSTATE_MULTISAMPLE on Gfx7/7.5
- * will be emitted dynamically, so skip it here. On Gfx8+
- * 3DSTATE_SAMPLE_PATTERN will be emitted dynamically, so skip it here.
- */
- if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)) {
-#if GFX_VER >= 8
- genX(emit_sample_pattern)(&pipeline->base.batch,
- pipeline->dynamic_state.sample_locations.samples,
- pipeline->dynamic_state.sample_locations.locations);
-#endif
- }
+ anv_pipeline_emit(pipeline, partial.ms, GENX(3DSTATE_MULTISAMPLE), ms) {
+ ms.PixelLocation = CENTER;
- genX(emit_multisample)(&pipeline->base.batch,
- pipeline->dynamic_state.sample_locations.samples,
- pipeline->dynamic_state.sample_locations.locations);
- } else {
- /* On Gfx8+ 3DSTATE_MULTISAMPLE does not hold anything we need to modify
- * for sample locations, so we don't have to emit it dynamically.
+ /* The PRM says that this bit is valid only for DX9:
+ *
+ * SW can choose to set this bit only for DX9 API. DX10/OGL API's
+ * should not have any effect by setting or not setting this bit.
*/
-#if GFX_VER >= 8
- genX(emit_multisample)(&pipeline->base.batch,
- info ? info->rasterizationSamples : 1,
- NULL);
-#endif
- }
-
- /* From the Vulkan 1.0 spec:
- * If pSampleMask is NULL, it is treated as if the mask has all bits
- * enabled, i.e. no coverage is removed from fragments.
- *
- * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
- */
-#if GFX_VER >= 8
- uint32_t sample_mask = 0xffff;
-#else
- uint32_t sample_mask = 0xff;
-#endif
-
- if (info && info->pSampleMask)
- sample_mask &= info->pSampleMask[0];
-
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
- sm.SampleMask = sample_mask;
- }
-
- pipeline->cps_state = ANV_STATE_NULL;
-#if GFX_VER >= 11
- if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) &&
- pipeline->base.device->vk.enabled_extensions.KHR_fragment_shading_rate) {
-#if GFX_VER >= 12
- struct anv_device *device = pipeline->base.device;
- const uint32_t num_dwords =
- GENX(CPS_STATE_length) * 4 * pipeline->dynamic_state.viewport.count;
- pipeline->cps_state =
- anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords, 32);
-#endif
-
- genX(emit_shading_rate)(&pipeline->base.batch,
- pipeline,
- pipeline->cps_state,
- &pipeline->dynamic_state);
+ ms.PixelPositionOffsetEnable = false;
}
-#endif
}
const uint32_t genX(vk_to_intel_logic_op)[] = {
@@ -882,36 +879,6 @@ const uint32_t genX(vk_to_intel_logic_op)[] = {
[VK_LOGIC_OP_SET] = LOGICOP_SET,
};
-static const uint32_t vk_to_intel_blend[] = {
- [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO,
- [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE,
- [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR,
- [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR,
- [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR,
- [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR,
- [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA,
- [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA,
- [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA,
- [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA,
- [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR,
- [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
- [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA,
- [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
- [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE,
- [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR,
- [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR,
- [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA,
- [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA,
-};
-
-static const uint32_t vk_to_intel_blend_op[] = {
- [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD,
- [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT,
- [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT,
- [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN,
- [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX,
-};
-
const uint32_t genX(vk_to_intel_compare_op)[] = {
[VK_COMPARE_OP_NEVER] = PREFILTEROP_NEVER,
[VK_COMPARE_OP_LESS] = PREFILTEROP_LESS,
@@ -947,656 +914,99 @@ const uint32_t genX(vk_to_intel_primitive_type)[] = {
[VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
};
-/* This function sanitizes the VkStencilOpState by looking at the compare ops
- * and trying to determine whether or not a given stencil op can ever actually
- * occur. Stencil ops which can never occur are set to VK_STENCIL_OP_KEEP.
- * This function returns true if, after sanitation, any of the stencil ops are
- * set to something other than VK_STENCIL_OP_KEEP.
- */
-static bool
-sanitize_stencil_face(VkStencilOpState *face,
- VkCompareOp depthCompareOp)
-{
- /* If compareOp is ALWAYS then the stencil test will never fail and failOp
- * will never happen. Set failOp to KEEP in this case.
- */
- if (face->compareOp == VK_COMPARE_OP_ALWAYS)
- face->failOp = VK_STENCIL_OP_KEEP;
-
- /* If compareOp is NEVER or depthCompareOp is NEVER then one of the depth
- * or stencil tests will fail and passOp will never happen.
- */
- if (face->compareOp == VK_COMPARE_OP_NEVER ||
- depthCompareOp == VK_COMPARE_OP_NEVER)
- face->passOp = VK_STENCIL_OP_KEEP;
-
- /* If compareOp is NEVER or depthCompareOp is ALWAYS then either the
- * stencil test will fail or the depth test will pass. In either case,
- * depthFailOp will never happen.
- */
- if (face->compareOp == VK_COMPARE_OP_NEVER ||
- depthCompareOp == VK_COMPARE_OP_ALWAYS)
- face->depthFailOp = VK_STENCIL_OP_KEEP;
-
- return face->failOp != VK_STENCIL_OP_KEEP ||
- face->depthFailOp != VK_STENCIL_OP_KEEP ||
- face->passOp != VK_STENCIL_OP_KEEP;
-}
-
-/* Intel hardware is fairly sensitive to whether or not depth/stencil writes
- * are enabled. In the presence of discards, it's fairly easy to get into the
- * non-promoted case which means a fairly big performance hit. From the Iron
- * Lake PRM, Vol 2, pt. 1, section 8.4.3.2, "Early Depth Test Cases":
- *
- * "Non-promoted depth (N) is active whenever the depth test can be done
- * early but it cannot determine whether or not to write source depth to
- * the depth buffer, therefore the depth write must be performed post pixel
- * shader. This includes cases where the pixel shader can kill pixels,
- * including via sampler chroma key, as well as cases where the alpha test
- * function is enabled, which kills pixels based on a programmable alpha
- * test. In this case, even if the depth test fails, the pixel cannot be
- * killed if a stencil write is indicated. Whether or not the stencil write
- * happens depends on whether or not the pixel is killed later. In these
- * cases if stencil test fails and stencil writes are off, the pixels can
- * also be killed early. If stencil writes are enabled, the pixels must be
- * treated as Computed depth (described above)."
- *
- * The same thing as mentioned in the stencil case can happen in the depth
- * case as well if it thinks it writes depth but, thanks to the depth test
- * being GL_EQUAL, the write doesn't actually matter. A little extra work
- * up-front to try and disable depth and stencil writes can make a big
- * difference.
- *
- * Unfortunately, the way depth and stencil testing is specified, there are
- * many cases where, regardless of depth/stencil writes being enabled, nothing
- * actually gets written due to some other bit of state being set. This
- * function attempts to "sanitize" the depth stencil state and disable writes
- * and sometimes even testing whenever possible.
- */
-static void
-sanitize_ds_state(VkPipelineDepthStencilStateCreateInfo *state,
- bool *stencilWriteEnable,
- VkImageAspectFlags ds_aspects)
-{
- *stencilWriteEnable = state->stencilTestEnable;
-
- /* If the depth test is disabled, we won't be writing anything. Make sure we
- * treat the test as always passing later on as well.
- *
- * Also, the Vulkan spec requires that if either depth or stencil is not
- * present, the pipeline is to act as if the test silently passes. In that
- * case we won't write either.
- */
- if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
- state->depthWriteEnable = false;
- state->depthCompareOp = VK_COMPARE_OP_ALWAYS;
- }
-
- if (!(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
- *stencilWriteEnable = false;
- state->front.compareOp = VK_COMPARE_OP_ALWAYS;
- state->back.compareOp = VK_COMPARE_OP_ALWAYS;
- }
-
- /* If the stencil test is enabled and always fails, then we will never get
- * to the depth test so we can just disable the depth test entirely.
- */
- if (state->stencilTestEnable &&
- state->front.compareOp == VK_COMPARE_OP_NEVER &&
- state->back.compareOp == VK_COMPARE_OP_NEVER) {
- state->depthTestEnable = false;
- state->depthWriteEnable = false;
- }
-
- /* If depthCompareOp is EQUAL then the value we would be writing to the
- * depth buffer is the same as the value that's already there so there's no
- * point in writing it.
- */
- if (state->depthCompareOp == VK_COMPARE_OP_EQUAL)
- state->depthWriteEnable = false;
-
- /* If the stencil ops are such that we don't actually ever modify the
- * stencil buffer, we should disable writes.
- */
- if (!sanitize_stencil_face(&state->front, state->depthCompareOp) &&
- !sanitize_stencil_face(&state->back, state->depthCompareOp))
- *stencilWriteEnable = false;
-
- /* If the depth test always passes and we never write out depth, that's the
- * same as if the depth test is disabled entirely.
- */
- if (state->depthCompareOp == VK_COMPARE_OP_ALWAYS &&
- !state->depthWriteEnable)
- state->depthTestEnable = false;
-
- /* If the stencil test always passes and we never write out stencil, that's
- * the same as if the stencil test is disabled entirely.
- */
- if (state->front.compareOp == VK_COMPARE_OP_ALWAYS &&
- state->back.compareOp == VK_COMPARE_OP_ALWAYS &&
- !*stencilWriteEnable)
- state->stencilTestEnable = false;
-}
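/* A minimal sketch of what sanitize_ds_state() does to a typical state, using
 * hypothetical input values (not taken from this change):
 *
 *    VkPipelineDepthStencilStateCreateInfo info = {
 *       .depthTestEnable   = VK_TRUE,
 *       .depthWriteEnable  = VK_TRUE,
 *       .depthCompareOp    = VK_COMPARE_OP_EQUAL,
 *       .stencilTestEnable = VK_TRUE,
 *       .front = { .compareOp = VK_COMPARE_OP_ALWAYS },   (all ops left at KEEP)
 *       .back  = { .compareOp = VK_COMPARE_OP_ALWAYS },   (all ops left at KEEP)
 *    };
 *    bool stencil_writes;
 *    sanitize_ds_state(&info, &stencil_writes,
 *                      VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
 *
 * Afterwards info.depthWriteEnable is false (EQUAL would write back the value
 * already in the buffer), stencil_writes is false (no op ever modifies the
 * stencil buffer) and info.stencilTestEnable is false (a test that always
 * passes and never writes is the same as no test), which keeps the hardware
 * on the promoted early-depth paths described above.
 */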
-
-static void
-emit_ds_state(struct anv_graphics_pipeline *pipeline,
- const VkPipelineDepthStencilStateCreateInfo *pCreateInfo,
- const uint32_t dynamic_states,
- const struct anv_render_pass *pass,
- const struct anv_subpass *subpass)
-{
-#if GFX_VER == 7
-# define depth_stencil_dw pipeline->gfx7.depth_stencil_state
-#elif GFX_VER == 8
-# define depth_stencil_dw pipeline->gfx8.wm_depth_stencil
-#else
-# define depth_stencil_dw pipeline->gfx9.wm_depth_stencil
-#endif
-
- if (pCreateInfo == NULL) {
- /* We're going to OR this together with the dynamic state. We need
- * to make sure it's initialized to something useful.
- */
- pipeline->writes_stencil = false;
- pipeline->stencil_test_enable = false;
- pipeline->writes_depth = false;
- pipeline->depth_test_enable = false;
- pipeline->depth_bounds_test_enable = false;
- memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
- return;
- }
-
- VkImageAspectFlags ds_aspects = 0;
- if (subpass->depth_stencil_attachment) {
- VkFormat depth_stencil_format =
- pass->attachments[subpass->depth_stencil_attachment->attachment].format;
- ds_aspects = vk_format_aspects(depth_stencil_format);
- }
-
- VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
- sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);
- pipeline->stencil_test_enable = info.stencilTestEnable;
- pipeline->writes_depth = info.depthWriteEnable;
- pipeline->depth_test_enable = info.depthTestEnable;
- pipeline->depth_bounds_test_enable = info.depthBoundsTestEnable;
-
- bool dynamic_stencil_op =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
-
-#if GFX_VER <= 7
- struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
-#else
- struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {
-#endif
- .DepthTestEnable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE ?
- 0 : info.depthTestEnable,
-
- .DepthBufferWriteEnable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE ?
- 0 : info.depthWriteEnable,
-
- .DepthTestFunction =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP ?
- 0 : genX(vk_to_intel_compare_op)[info.depthCompareOp],
-
- .DoubleSidedStencilEnable = true,
-
- .StencilTestEnable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE ?
- 0 : info.stencilTestEnable,
-
- .StencilFailOp = genX(vk_to_intel_stencil_op)[info.front.failOp],
- .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.front.passOp],
- .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.front.depthFailOp],
- .StencilTestFunction = genX(vk_to_intel_compare_op)[info.front.compareOp],
- .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[info.back.failOp],
- .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.back.passOp],
- .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.back.depthFailOp],
- .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[info.back.compareOp],
- };
-
- if (dynamic_stencil_op) {
- depth_stencil.StencilFailOp = 0;
- depth_stencil.StencilPassDepthPassOp = 0;
- depth_stencil.StencilPassDepthFailOp = 0;
- depth_stencil.StencilTestFunction = 0;
- depth_stencil.BackfaceStencilFailOp = 0;
- depth_stencil.BackfaceStencilPassDepthPassOp = 0;
- depth_stencil.BackfaceStencilPassDepthFailOp = 0;
- depth_stencil.BackfaceStencilTestFunction = 0;
- }
-
-#if GFX_VER <= 7
- GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
-#else
- GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);
-#endif
-}
-
-static bool
-is_dual_src_blend_factor(VkBlendFactor factor)
-{
- return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
- factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
- factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
- factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
-}
-
-static inline uint32_t *
-write_disabled_blend(uint32_t *state)
-{
- struct GENX(BLEND_STATE_ENTRY) entry = {
- .WriteDisableAlpha = true,
- .WriteDisableRed = true,
- .WriteDisableGreen = true,
- .WriteDisableBlue = true,
- };
- GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry);
- return state + GENX(BLEND_STATE_ENTRY_length);
-}
-
static void
-emit_cb_state(struct anv_graphics_pipeline *pipeline,
- const VkPipelineColorBlendStateCreateInfo *info,
- const VkPipelineMultisampleStateCreateInfo *ms_info,
- uint32_t dynamic_states)
+emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
+ const struct vk_input_assembly_state *ia,
+ const struct vk_viewport_state *vp,
+ const struct vk_rasterization_state *rs)
{
- struct anv_device *device = pipeline->base.device;
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+ (void) wm_prog_data;
- struct GENX(BLEND_STATE) blend_state = {
-#if GFX_VER >= 8
- .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
- .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
-#endif
- };
-
- uint32_t surface_count = 0;
- struct anv_pipeline_bind_map *map;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
- surface_count = map->surface_count;
- }
-
- const uint32_t num_dwords = GENX(BLEND_STATE_length) +
- GENX(BLEND_STATE_ENTRY_length) * surface_count;
- uint32_t *blend_state_start, *state_pos;
-
- if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- blend_state_start = devinfo->ver >= 8 ?
- pipeline->gfx8.blend_state : pipeline->gfx7.blend_state;
- pipeline->blend_state = ANV_STATE_NULL;
- } else {
- pipeline->blend_state =
- anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
- blend_state_start = pipeline->blend_state.map;
- }
- state_pos = blend_state_start;
-
- bool has_writeable_rt = false;
- state_pos += GENX(BLEND_STATE_length);
-#if GFX_VER >= 8
- struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
-#endif
- for (unsigned i = 0; i < surface_count; i++) {
- struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
-
- /* All color attachments are at the beginning of the binding table */
- if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
- break;
-
- /* We can have at most 8 attachments */
- assert(i < MAX_RTS);
-
- if (info == NULL || binding->index >= info->attachmentCount) {
- state_pos = write_disabled_blend(state_pos);
- continue;
- }
+ anv_pipeline_emit(pipeline, partial.clip, GENX(3DSTATE_CLIP), clip) {
+ clip.ClipEnable = true;
+ clip.StatisticsEnable = true;
+ clip.EarlyCullEnable = true;
+ clip.GuardbandClipTestEnable = true;
- if ((pipeline->dynamic_state.color_writes & (1u << binding->index)) == 0) {
- state_pos = write_disabled_blend(state_pos);
- continue;
- }
+ clip.VertexSubPixelPrecisionSelect = _8Bit;
+ clip.ClipMode = CLIPMODE_NORMAL;
- const VkPipelineColorBlendAttachmentState *a =
- &info->pAttachments[binding->index];
+ clip.MinimumPointWidth = 0.125;
+ clip.MaximumPointWidth = 255.875;
- struct GENX(BLEND_STATE_ENTRY) entry = {
-#if GFX_VER < 8
- .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
- .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
-#endif
- .LogicOpEnable = info->logicOpEnable,
- .LogicOpFunction = dynamic_states & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP ?
- 0: genX(vk_to_intel_logic_op)[info->logicOp],
+ /* TODO(mesh): Multiview. */
+ if (anv_pipeline_is_primitive(pipeline)) {
+ const struct brw_vue_prog_data *last =
+ anv_pipeline_get_last_vue_prog_data(pipeline);
- /* Vulkan specification 1.2.168, VkLogicOp:
+ /* From the Vulkan 1.0.45 spec:
*
- * "Logical operations are controlled by the logicOpEnable and
- * logicOp members of VkPipelineColorBlendStateCreateInfo. If
- * logicOpEnable is VK_TRUE, then a logical operation selected by
- * logicOp is applied between each color attachment and the
- * fragment’s corresponding output value, and blending of all
- * attachments is treated as if it were disabled."
- *
- * From the Broadwell PRM Volume 2d: Command Reference: Structures:
- * BLEND_STATE_ENTRY:
- *
- * "Enabling LogicOp and Color Buffer Blending at the same time is
- * UNDEFINED"
+ * "If the last active vertex processing stage shader entry
+ * point's interface does not include a variable decorated with
+ * ViewportIndex, then the first viewport is used."
*/
- .ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,
- .ColorClampRange = COLORCLAMP_RTFORMAT,
- .PreBlendColorClampEnable = true,
- .PostBlendColorClampEnable = true,
- .SourceBlendFactor = vk_to_intel_blend[a->srcColorBlendFactor],
- .DestinationBlendFactor = vk_to_intel_blend[a->dstColorBlendFactor],
- .ColorBlendFunction = vk_to_intel_blend_op[a->colorBlendOp],
- .SourceAlphaBlendFactor = vk_to_intel_blend[a->srcAlphaBlendFactor],
- .DestinationAlphaBlendFactor = vk_to_intel_blend[a->dstAlphaBlendFactor],
- .AlphaBlendFunction = vk_to_intel_blend_op[a->alphaBlendOp],
- .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
- .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
- .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
- .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
- };
-
- if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
- a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
- a->colorBlendOp != a->alphaBlendOp) {
-#if GFX_VER >= 8
- blend_state.IndependentAlphaBlendEnable = true;
-#else
- entry.IndependentAlphaBlendEnable = true;
-#endif
- }
-
- /* The Dual Source Blending documentation says:
- *
- * "If SRC1 is included in a src/dst blend factor and
- * a DualSource RT Write message is not used, results
- * are UNDEFINED. (This reflects the same restriction in DX APIs,
- * where undefined results are produced if “o1” is not written
- * by a PS – there are no default values defined)."
- *
- * There is no way to gracefully fix this undefined situation
- * so we just disable the blending to prevent possible issues.
- */
- if (!wm_prog_data->dual_src_blend &&
- (is_dual_src_blend_factor(a->srcColorBlendFactor) ||
- is_dual_src_blend_factor(a->dstColorBlendFactor) ||
- is_dual_src_blend_factor(a->srcAlphaBlendFactor) ||
- is_dual_src_blend_factor(a->dstAlphaBlendFactor))) {
- vk_debug_report(&device->physical->instance->vk,
- VK_DEBUG_REPORT_WARNING_BIT_EXT,
- &device->vk.base, 0, 0, "anv",
- "Enabled dual-src blend factors without writing both targets "
- "in the shader. Disabling blending to avoid GPU hangs.");
- entry.ColorBufferBlendEnable = false;
- }
+ if (vp && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
+ clip.MaximumVPIndex = vp->viewport_count > 0 ?
+ vp->viewport_count - 1 : 0;
+ } else {
+ clip.MaximumVPIndex = 0;
+ }
- if (a->colorWriteMask != 0)
- has_writeable_rt = true;
+ /* From the Vulkan 1.0.45 spec:
+ *
+ * "If the last active vertex processing stage shader entry point's
+ * interface does not include a variable decorated with Layer, then
+ * the first layer is used."
+ */
+ clip.ForceZeroRTAIndexEnable =
+ !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
+
+ } else if (anv_pipeline_is_mesh(pipeline)) {
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ if (vp && vp->viewport_count > 0 &&
+ mesh_prog_data->map.start_dw[VARYING_SLOT_VIEWPORT] >= 0) {
+ clip.MaximumVPIndex = vp->viewport_count - 1;
+ } else {
+ clip.MaximumVPIndex = 0;
+ }
- /* Our hardware applies the blend factor prior to the blend function
- * regardless of what function is used. Technically, this means the
- * hardware can do MORE than GL or Vulkan specify. However, it also
- * means that, for MIN and MAX, we have to stomp the blend factor to
- * ONE to make it a no-op.
- */
- if (a->colorBlendOp == VK_BLEND_OP_MIN ||
- a->colorBlendOp == VK_BLEND_OP_MAX) {
- entry.SourceBlendFactor = BLENDFACTOR_ONE;
- entry.DestinationBlendFactor = BLENDFACTOR_ONE;
- }
- if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
- a->alphaBlendOp == VK_BLEND_OP_MAX) {
- entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
- entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
+ clip.ForceZeroRTAIndexEnable =
+ mesh_prog_data->map.start_dw[VARYING_SLOT_LAYER] < 0;
}
- GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
- state_pos += GENX(BLEND_STATE_ENTRY_length);
-#if GFX_VER >= 8
- if (i == 0)
- bs0 = entry;
-#endif
- }
-#if GFX_VER >= 8
- struct GENX(3DSTATE_PS_BLEND) blend = {
- GENX(3DSTATE_PS_BLEND_header),
- };
- blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable;
- blend.HasWriteableRT = has_writeable_rt;
- blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable;
- blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor;
- blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor;
- blend.SourceBlendFactor = bs0.SourceBlendFactor;
- blend.DestinationBlendFactor = bs0.DestinationBlendFactor;
- blend.AlphaTestEnable = false;
- blend.IndependentAlphaBlendEnable = blend_state.IndependentAlphaBlendEnable;
-
- if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {
- GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend);
- } else {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), _blend)
- _blend = blend;
+ clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
+ wm_prog_data->uses_nonperspective_interp_modes : 0;
}
-#else
- (void)has_writeable_rt;
-#endif
- GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state);
-
- if (!(dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
- ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP))) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
- bsp.BlendStatePointer = pipeline->blend_state.offset;
-#if GFX_VER >= 8
- bsp.BlendStatePointerValid = true;
-#endif
+#if GFX_VERx10 >= 125
+ if (anv_pipeline_is_mesh(pipeline)) {
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ anv_pipeline_emit(pipeline, final.clip_mesh,
+ GENX(3DSTATE_CLIP_MESH), clip_mesh) {
+ clip_mesh.PrimitiveHeaderEnable = mesh_prog_data->map.per_primitive_header_size_dw > 0;
+ clip_mesh.UserClipDistanceClipTestEnableBitmask = mesh_prog_data->clip_distance_mask;
+ clip_mesh.UserClipDistanceCullTestEnableBitmask = mesh_prog_data->cull_distance_mask;
}
}
-}
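/* For reference, a hypothetical attachment state (values are illustrative
 * only, not from this change) that trips the dual-source fallback above when
 * the fragment shader does not also write the second color output:
 *
 *    const VkPipelineColorBlendAttachmentState att = {
 *       .blendEnable         = VK_TRUE,
 *       .srcColorBlendFactor = VK_BLEND_FACTOR_SRC1_COLOR,
 *       .dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR,
 *       .colorBlendOp        = VK_BLEND_OP_ADD,
 *       .srcAlphaBlendFactor = VK_BLEND_FACTOR_SRC1_ALPHA,
 *       .dstAlphaBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA,
 *       .alphaBlendOp        = VK_BLEND_OP_ADD,
 *       .colorWriteMask      = VK_COLOR_COMPONENT_R_BIT |
 *                              VK_COLOR_COMPONENT_G_BIT |
 *                              VK_COLOR_COMPONENT_B_BIT |
 *                              VK_COLOR_COMPONENT_A_BIT,
 *    };
 *
 * is_dual_src_blend_factor() is true for all four factors here, so unless
 * wm_prog_data->dual_src_blend is set the entry's ColorBufferBlendEnable is
 * forced off and a warning is reported instead of risking a GPU hang.
 */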
-
-static void
-emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
- const VkPipelineInputAssemblyStateCreateInfo *ia_info,
- const VkPipelineViewportStateCreateInfo *vp_info,
- const VkPipelineRasterizationStateCreateInfo *rs_info,
- const uint32_t dynamic_states)
-{
- const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- (void) wm_prog_data;
-
- struct GENX(3DSTATE_CLIP) clip = {
- GENX(3DSTATE_CLIP_header),
- };
-
- clip.ClipEnable = true;
- clip.StatisticsEnable = true;
- clip.EarlyCullEnable = true;
- clip.APIMode = APIMODE_D3D;
- clip.GuardbandClipTestEnable = true;
-
- /* Only enable the XY clip test when the final polygon rasterization
- * mode is VK_POLYGON_MODE_FILL. We want to leave it disabled for
- * points and lines so we get "pop-free" clipping.
- */
- VkPolygonMode raster_mode =
- genX(raster_polygon_mode)(pipeline, ia_info->topology);
- clip.ViewportXYClipTestEnable =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ?
- 0 : (raster_mode == VK_POLYGON_MODE_FILL);
-
-#if GFX_VER >= 8
- clip.VertexSubPixelPrecisionSelect = _8Bit;
-#endif
- clip.ClipMode = CLIPMODE_NORMAL;
-
- switch (vk_provoking_vertex_mode(rs_info)) {
- case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
- clip.TriangleStripListProvokingVertexSelect = 0;
- clip.LineStripListProvokingVertexSelect = 0;
- clip.TriangleFanProvokingVertexSelect = 1;
- break;
-
- case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
- clip.TriangleStripListProvokingVertexSelect = 2;
- clip.LineStripListProvokingVertexSelect = 1;
- clip.TriangleFanProvokingVertexSelect = 2;
- break;
-
- default:
- unreachable("Invalid provoking vertex mode");
- }
-
- clip.MinimumPointWidth = 0.125;
- clip.MaximumPointWidth = 255.875;
-
- const struct brw_vue_prog_data *last =
- anv_pipeline_get_last_vue_prog_data(pipeline);
-
- /* From the Vulkan 1.0.45 spec:
- *
- * "If the last active vertex processing stage shader entry point's
- * interface does not include a variable decorated with
- * ViewportIndex, then the first viewport is used."
- */
- if (vp_info && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
- clip.MaximumVPIndex = vp_info->viewportCount > 0 ?
- vp_info->viewportCount - 1 : 0;
- } else {
- clip.MaximumVPIndex = 0;
- }
-
- /* From the Vulkan 1.0.45 spec:
- *
- * "If the last active vertex processing stage shader entry point's
- * interface does not include a variable decorated with Layer, then
- * the first layer is used."
- */
- clip.ForceZeroRTAIndexEnable =
- !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
-
-#if GFX_VER == 7
- clip.FrontWinding = genX(vk_to_intel_front_face)[rs_info->frontFace];
- clip.CullMode = genX(vk_to_intel_cullmode)[rs_info->cullMode];
- clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;
- clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;
- clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;
-#else
- clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
- (wm_prog_data->barycentric_interp_modes &
- BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;
#endif
-
- GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip);
}
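/* A small worked example for the viewport/layer programming above, with
 * hypothetical state (not from this change): if the last pre-rasterization
 * stage writes ViewportIndex and vp->viewport_count == 4, MaximumVPIndex is
 * programmed to 3; if that stage does not write Layer,
 * ForceZeroRTAIndexEnable is set so all primitives land in render target
 * array index 0, matching the "first layer is used" language quoted above.
 */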
static void
emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
- const VkPipelineRasterizationStateCreateInfo *rs_info,
- const uint32_t dynamic_states)
+ const struct vk_rasterization_state *rs)
{
const struct brw_vue_prog_data *prog_data =
anv_pipeline_get_last_vue_prog_data(pipeline);
- const struct brw_vue_map *vue_map = &prog_data->vue_map;
+ const struct intel_vue_map *vue_map = &prog_data->vue_map;
nir_xfb_info *xfb_info;
if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
- xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
+ xfb_info = pipeline->base.shaders[MESA_SHADER_GEOMETRY]->xfb_info;
else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
- xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
+ xfb_info = pipeline->base.shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
else
- xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;
-
-#if GFX_VER == 7
-# define streamout_state_dw pipeline->gfx7.streamout_state
-#else
-# define streamout_state_dw pipeline->gfx8.streamout_state
-#endif
-
- struct GENX(3DSTATE_STREAMOUT) so = {
- GENX(3DSTATE_STREAMOUT_header),
- .RenderingDisable =
- (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) ?
- 0 : rs_info->rasterizerDiscardEnable,
- };
-
- if (xfb_info) {
- so.SOFunctionEnable = true;
- so.SOStatisticsEnable = true;
-
- switch (vk_provoking_vertex_mode(rs_info)) {
- case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
- so.ReorderMode = LEADING;
- break;
-
- case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
- so.ReorderMode = TRAILING;
- break;
-
- default:
- unreachable("Invalid provoking vertex mode");
- }
-
- const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
- vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
- so.RenderStreamSelect = stream_info ?
- stream_info->rasterizationStream : 0;
-
-#if GFX_VER >= 8
- so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
- so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
- so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
- so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
-#else
- pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride;
- pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride;
- pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride;
- pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride;
-
- /* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which
- * is a bit inconvenient because we don't know what buffers will
- * actually be enabled until draw time. We do our best here by
- * setting them based on buffers_written and we disable them
- * as-needed at draw time by setting EndAddress = BaseAddress.
- */
- so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0);
- so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1);
- so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2);
- so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3);
-#endif
-
- int urb_entry_read_offset = 0;
- int urb_entry_read_length =
- (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
-
- /* We always read the whole vertex. This could be reduced at some
- * point by reading less and offsetting the register index in the
- * SO_DECLs.
- */
- so.Stream0VertexReadOffset = urb_entry_read_offset;
- so.Stream0VertexReadLength = urb_entry_read_length - 1;
- so.Stream1VertexReadOffset = urb_entry_read_offset;
- so.Stream1VertexReadLength = urb_entry_read_length - 1;
- so.Stream2VertexReadOffset = urb_entry_read_offset;
- so.Stream2VertexReadLength = urb_entry_read_length - 1;
- so.Stream3VertexReadOffset = urb_entry_read_offset;
- so.Stream3VertexReadLength = urb_entry_read_length - 1;
- }
-
- if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
- GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so);
- } else {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), _so)
- _so = so;
- }
+ xfb_info = pipeline->base.shaders[MESA_SHADER_VERTEX]->xfb_info;
if (xfb_info) {
struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
@@ -1679,16 +1089,17 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
}
- uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls,
- GENX(3DSTATE_SO_DECL_LIST),
- .StreamtoBufferSelects0 = sbs[0],
- .StreamtoBufferSelects1 = sbs[1],
- .StreamtoBufferSelects2 = sbs[2],
- .StreamtoBufferSelects3 = sbs[3],
- .NumEntries0 = decls[0],
- .NumEntries1 = decls[1],
- .NumEntries2 = decls[2],
- .NumEntries3 = decls[3]);
+ uint32_t *dw = anv_pipeline_emitn(pipeline, final.so_decl_list,
+ 3 + 2 * max_decls,
+ GENX(3DSTATE_SO_DECL_LIST),
+ .StreamtoBufferSelects0 = sbs[0],
+ .StreamtoBufferSelects1 = sbs[1],
+ .StreamtoBufferSelects2 = sbs[2],
+ .StreamtoBufferSelects3 = sbs[3],
+ .NumEntries0 = decls[0],
+ .NumEntries1 = decls[1],
+ .NumEntries2 = decls[2],
+ .NumEntries3 = decls[3]);
for (int i = 0; i < max_decls; i++) {
GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
@@ -1700,6 +1111,37 @@ emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
});
}
}
+
+ anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so) {
+ if (xfb_info) {
+ pipeline->uses_xfb = true;
+
+ so.SOFunctionEnable = true;
+ so.SOStatisticsEnable = true;
+
+ so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
+ so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
+ so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
+ so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
+
+ int urb_entry_read_offset = 0;
+ int urb_entry_read_length =
+ (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
+
+ /* We always read the whole vertex. This could be reduced at some
+ * point by reading less and offsetting the register index in the
+ * SO_DECLs.
+ */
+ so.Stream0VertexReadOffset = urb_entry_read_offset;
+ so.Stream0VertexReadLength = urb_entry_read_length - 1;
+ so.Stream1VertexReadOffset = urb_entry_read_offset;
+ so.Stream1VertexReadLength = urb_entry_read_length - 1;
+ so.Stream2VertexReadOffset = urb_entry_read_offset;
+ so.Stream2VertexReadLength = urb_entry_read_length - 1;
+ so.Stream3VertexReadOffset = urb_entry_read_offset;
+ so.Stream3VertexReadLength = urb_entry_read_length - 1;
+ }
+ }
}
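/* Worked example for the vertex read programming above (hypothetical VUE
 * layout, not from this change): a last-stage VUE map with num_slots == 9
 * gives urb_entry_read_length == (9 + 1) / 2 == 5, i.e. the streamout unit
 * reads five 256-bit URB rows (two VUE slots each) per vertex, and every
 * StreamNVertexReadLength field is programmed with 5 - 1 == 4.
 */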
static uint32_t
@@ -1735,8 +1177,17 @@ get_scratch_space(const struct anv_shader_bin *bin)
static UNUSED uint32_t
get_scratch_surf(struct anv_pipeline *pipeline,
+ gl_shader_stage stage,
const struct anv_shader_bin *bin)
{
+ if (bin->prog_data->total_scratch == 0)
+ return 0;
+
+ struct anv_bo *bo =
+ anv_scratch_pool_alloc(pipeline->device,
+ &pipeline->device->scratch_pool,
+ stage, bin->prog_data->total_scratch);
+ anv_reloc_list_add_bo(pipeline->batch.relocs, bo);
return anv_scratch_pool_get_surf(pipeline->device,
&pipeline->device->scratch_pool,
bin->prog_data->total_scratch) >> 4;
@@ -1745,18 +1196,18 @@ get_scratch_surf(struct anv_pipeline *pipeline,
static void
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
{
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
const struct anv_shader_bin *vs_bin =
- pipeline->shaders[MESA_SHADER_VERTEX];
+ pipeline->base.shaders[MESA_SHADER_VERTEX];
assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) {
+ anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs) {
vs.Enable = true;
vs.StatisticsEnable = true;
vs.KernelStartPointer = vs_bin->kernel.offset;
-#if GFX_VER >= 8
+#if GFX_VER < 20
vs.SIMD8DispatchEnable =
vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
#endif
@@ -1785,7 +1236,7 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
* but the Haswell docs for the "VS Reference Count Full Force Miss
* Enable" field of the "Thread Mode" register refer to a HSW bug in
* which the VUE handle reference count would overflow resulting in
- * internal reference counting bugs. My (Jason's) best guess is that
+ * internal reference counting bugs. My (Faith's) best guess is that
* this bug cropped back up on SKL GT4 when we suddenly had more
* threads in play than any previous gfx9 hardware.
*
@@ -1805,44 +1256,42 @@ emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
vs.DispatchGRFStartRegisterForURBData =
vs_prog_data->base.base.dispatch_grf_start_reg;
-#if GFX_VER >= 8
vs.UserClipDistanceClipTestEnableBitmask =
vs_prog_data->base.clip_distance_mask;
vs.UserClipDistanceCullTestEnableBitmask =
vs_prog_data->base.cull_distance_mask;
-#endif
#if GFX_VERx10 >= 125
- vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, vs_bin);
+ vs.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
#else
vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
vs.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_VERTEX, vs_bin);
#endif
}
}
static void
-emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
- const VkPipelineTessellationStateCreateInfo *tess_info)
+emit_3dstate_hs_ds(struct anv_graphics_pipeline *pipeline,
+ const struct vk_tessellation_state *ts)
{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs);
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te);
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds);
+ anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
+ anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
return;
}
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
const struct anv_shader_bin *tcs_bin =
- pipeline->shaders[MESA_SHADER_TESS_CTRL];
+ pipeline->base.shaders[MESA_SHADER_TESS_CTRL];
const struct anv_shader_bin *tes_bin =
- pipeline->shaders[MESA_SHADER_TESS_EVAL];
+ pipeline->base.shaders[MESA_SHADER_TESS_EVAL];
const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) {
+ anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs) {
hs.Enable = true;
hs.StatisticsEnable = true;
hs.KernelStartPointer = tcs_bin->kernel.offset;
@@ -1874,11 +1323,12 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
#endif
#if GFX_VERx10 >= 125
- hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tcs_bin);
+ hs.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
#else
hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
hs.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_CTRL, tcs_bin);
#endif
#if GFX_VER == 12
@@ -1888,42 +1338,13 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
#endif
-#if GFX_VER >= 9
+#if GFX_VER < 20
hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
- hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
#endif
- }
-
- const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
- tess_info ? vk_find_struct_const(tess_info, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO) : NULL;
-
- VkTessellationDomainOrigin uv_origin =
- domain_origin_state ? domain_origin_state->domainOrigin :
- VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
-
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) {
- te.Partitioning = tes_prog_data->partitioning;
-
- if (uv_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
- te.OutputTopology = tes_prog_data->output_topology;
- } else {
- /* When the origin is upper-left, we have to flip the winding order */
- if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
- te.OutputTopology = OUTPUT_TRI_CW;
- } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
- te.OutputTopology = OUTPUT_TRI_CCW;
- } else {
- te.OutputTopology = tes_prog_data->output_topology;
- }
- }
-
- te.TEDomain = tes_prog_data->domain;
- te.TEEnable = true;
- te.MaximumTessellationFactorOdd = 63.0;
- te.MaximumTessellationFactorNotOdd = 64.0;
- }
+ hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
+   }
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) {
+ anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds) {
ds.Enable = true;
ds.StatisticsEnable = true;
ds.KernelStartPointer = tes_bin->kernel.offset;
@@ -1933,21 +1354,20 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
ds.ComputeWCoordinateEnable =
- tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+ tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
ds.PatchURBEntryReadOffset = 0;
ds.DispatchGRFStartRegisterForURBData =
tes_prog_data->base.base.dispatch_grf_start_reg;
-#if GFX_VER >= 8
#if GFX_VER < 11
ds.DispatchMode =
tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
- DISPATCH_MODE_SIMD8_SINGLE_PATCH :
- DISPATCH_MODE_SIMD4X2;
+ DISPATCH_MODE_SIMD8_SINGLE_PATCH :
+ DISPATCH_MODE_SIMD4X2;
#else
- assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
+ assert(tes_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
#endif
@@ -1955,37 +1375,105 @@ emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
tes_prog_data->base.clip_distance_mask;
ds.UserClipDistanceCullTestEnableBitmask =
tes_prog_data->base.cull_distance_mask;
-#endif
+#if GFX_VER >= 12
+ ds.PrimitiveIDNotRequired = !tes_prog_data->include_primitive_id;
+#endif
#if GFX_VERx10 >= 125
- ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tes_bin);
+ ds.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
#else
ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
ds.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_TESS_EVAL, tes_bin);
#endif
}
}
+static UNUSED bool
+geom_or_tess_prim_id_used(struct anv_graphics_pipeline *pipeline)
+{
+ const struct brw_tcs_prog_data *tcs_prog_data =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL) ?
+ get_tcs_prog_data(pipeline) : NULL;
+ const struct brw_tes_prog_data *tes_prog_data =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ?
+ get_tes_prog_data(pipeline) : NULL;
+ const struct brw_gs_prog_data *gs_prog_data =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY) ?
+ get_gs_prog_data(pipeline) : NULL;
+
+ return (tcs_prog_data && tcs_prog_data->include_primitive_id) ||
+ (tes_prog_data && tes_prog_data->include_primitive_id) ||
+ (gs_prog_data && gs_prog_data->include_primitive_id);
+}
+
static void
-emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
+emit_3dstate_te(struct anv_graphics_pipeline *pipeline)
{
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- const struct anv_shader_bin *gs_bin =
- pipeline->shaders[MESA_SHADER_GEOMETRY];
+ anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te) {
+ if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
+ const struct brw_tes_prog_data *tes_prog_data =
+ get_tes_prog_data(pipeline);
+
+ te.Partitioning = tes_prog_data->partitioning;
+ te.TEDomain = tes_prog_data->domain;
+ te.TEEnable = true;
+ te.MaximumTessellationFactorOdd = 63.0;
+ te.MaximumTessellationFactorNotOdd = 64.0;
+#if GFX_VERx10 >= 125
+ const struct anv_device *device = pipeline->base.base.device;
+ if (intel_needs_workaround(device->info, 22012699309))
+ te.TessellationDistributionMode = TEDMODE_RR_STRICT;
+ else
+ te.TessellationDistributionMode = TEDMODE_RR_FREE;
+
+ if (intel_needs_workaround(device->info, 14015055625)) {
+ /* Wa_14015055625:
+ *
+ * Disable Tessellation Distribution when primitive Id is enabled.
+ */
+ if (sbe_primitive_id_override(pipeline) ||
+ geom_or_tess_prim_id_used(pipeline))
+ te.TessellationDistributionMode = TEDMODE_OFF;
+ }
+
+#if GFX_VER >= 20
+ te.TessellationDistributionLevel = TEDLEVEL_REGION;
+#else
+ te.TessellationDistributionLevel = TEDLEVEL_PATCH;
+#endif
+ /* 64_TRIANGLES */
+ te.SmallPatchThreshold = 3;
+ /* 1K_TRIANGLES */
+ te.TargetBlockSize = 8;
+ /* 1K_TRIANGLES */
+ te.LocalBOPAccumulatorThreshold = 1;
+#endif
+ }
+ }
+}
+static void
+emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
+{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs);
+ anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
return;
}
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
+ const struct anv_shader_bin *gs_bin =
+ pipeline->base.shaders[MESA_SHADER_GEOMETRY];
const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) {
+ anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs) {
gs.Enable = true;
gs.StatisticsEnable = true;
gs.KernelStartPointer = gs_bin->kernel.offset;
+#if GFX_VER < 20
gs.DispatchMode = gs_prog_data->base.dispatch_mode;
+#endif
gs.SingleProgramFlow = false;
gs.VectorMaskEnable = false;
@@ -1995,400 +1483,201 @@ emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;
gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
- if (GFX_VER == 8) {
- /* Broadwell is weird. It needs us to divide by 2. */
- gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;
- } else {
- gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
- }
+ gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
gs.OutputTopology = gs_prog_data->output_topology;
- gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
gs.ControlDataFormat = gs_prog_data->control_data_format;
gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;
gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1;
- gs.ReorderMode = TRAILING;
-#if GFX_VER >= 8
gs.ExpectedVertexCount = gs_prog_data->vertices_in;
gs.StaticOutput = gs_prog_data->static_vertex_count >= 0;
gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
- gs_prog_data->static_vertex_count : 0;
-#endif
+ gs_prog_data->static_vertex_count : 0;
gs.VertexURBEntryReadOffset = 0;
gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
gs.DispatchGRFStartRegisterForURBData =
gs_prog_data->base.base.dispatch_grf_start_reg;
-#if GFX_VER >= 8
gs.UserClipDistanceClipTestEnableBitmask =
gs_prog_data->base.clip_distance_mask;
gs.UserClipDistanceCullTestEnableBitmask =
gs_prog_data->base.cull_distance_mask;
-#endif
#if GFX_VERx10 >= 125
- gs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, gs_bin);
+ gs.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
#else
gs.PerThreadScratchSpace = get_scratch_space(gs_bin);
gs.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_GEOMETRY, gs_bin);
#endif
}
}
-static bool
-has_color_buffer_write_enabled(const struct anv_graphics_pipeline *pipeline,
- const VkPipelineColorBlendStateCreateInfo *blend)
-{
- const struct anv_shader_bin *shader_bin =
- pipeline->shaders[MESA_SHADER_FRAGMENT];
- if (!shader_bin)
- return false;
-
- if (!pipeline->dynamic_state.color_writes)
- return false;
-
- const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;
- for (int i = 0; i < bind_map->surface_count; i++) {
- struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[i];
-
- if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
- continue;
-
- if (binding->index == UINT32_MAX)
- continue;
-
- if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)
- return true;
- }
-
- return false;
-}
-
static void
-emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,
- const VkPipelineInputAssemblyStateCreateInfo *ia,
- const VkPipelineRasterizationStateCreateInfo *raster,
- const VkPipelineColorBlendStateCreateInfo *blend,
- const VkPipelineMultisampleStateCreateInfo *multisample,
- const VkPipelineRasterizationLineStateCreateInfoEXT *line,
- const uint32_t dynamic_states)
+emit_3dstate_wm(struct anv_graphics_pipeline *pipeline,
+ const struct vk_input_assembly_state *ia,
+ const struct vk_rasterization_state *rs,
+ const struct vk_multisample_state *ms,
+ const struct vk_color_blend_state *cb,
+ const struct vk_render_pass_state *rp)
{
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- struct GENX(3DSTATE_WM) wm = {
- GENX(3DSTATE_WM_header),
- };
- wm.StatisticsEnable = true;
- wm.LineEndCapAntialiasingRegionWidth = _05pixels;
- wm.LineAntialiasingRegionWidth = _10pixels;
- wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+ anv_pipeline_emit(pipeline, partial.wm, GENX(3DSTATE_WM), wm) {
+ wm.StatisticsEnable = true;
+ wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+ wm.LineAntialiasingRegionWidth = _10pixels;
+ wm.PointRasterizationRule = RASTRULE_UPPER_LEFT;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- if (wm_prog_data->early_fragment_tests) {
+ if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+ if (wm_prog_data->early_fragment_tests) {
wm.EarlyDepthStencilControl = EDSC_PREPS;
- } else if (wm_prog_data->has_side_effects) {
- wm.EarlyDepthStencilControl = EDSC_PSEXEC;
- } else {
- wm.EarlyDepthStencilControl = EDSC_NORMAL;
- }
-
-#if GFX_VER >= 8
- /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
- * doesn't take into account KillPixels when no depth or stencil
- * writes are enabled. In order for occlusion queries to work
- * correctly with no attachments, we need to force-enable PS thread
- * dispatch.
- *
-    * The BDW docs are pretty clear that this bit isn't validated
- * and probably shouldn't be used in production:
- *
- * "This must always be set to Normal. This field should not be
- * tested for functional validation."
- *
- * Unfortunately, however, the other mechanism we have for doing this
- * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
- * Given two bad options, we choose the one which works.
- */
- pipeline->force_fragment_thread_dispatch =
- wm_prog_data->has_side_effects ||
- wm_prog_data->uses_kill;
-
- if (pipeline->force_fragment_thread_dispatch ||
- !has_color_buffer_write_enabled(pipeline, blend)) {
- /* Only set this value in non dynamic mode. */
- wm.ForceThreadDispatchEnable =
- !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) ? ForceON : 0;
- }
-#endif
-
- wm.BarycentricInterpolationMode =
- wm_prog_data->barycentric_interp_modes;
-
-#if GFX_VER < 8
- wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
- wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
- wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
- wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
-
- /* If the subpass has a depth or stencil self-dependency, then we
- * need to force the hardware to do the depth/stencil write *after*
- * fragment shader execution. Otherwise, the writes may hit memory
- * before we get around to fetching from the input attachment and we
- * may get the depth or stencil value from the current draw rather
- * than the previous one.
- */
- wm.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
- wm_prog_data->uses_kill;
-
- pipeline->force_fragment_thread_dispatch =
- wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
- wm_prog_data->has_side_effects ||
- wm.PixelShaderKillsPixel;
-
- if (pipeline->force_fragment_thread_dispatch ||
- has_color_buffer_write_enabled(pipeline, blend)) {
- /* Only set this value in non dynamic mode. */
- wm.ThreadDispatchEnable = !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
- }
-
- if (multisample && multisample->rasterizationSamples > 1) {
- if (wm_prog_data->persample_dispatch) {
- wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+ } else if (wm_prog_data->has_side_effects) {
+ wm.EarlyDepthStencilControl = EDSC_PSEXEC;
} else {
- wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
+ wm.EarlyDepthStencilControl = EDSC_NORMAL;
}
- } else {
- wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
- }
-
- VkPolygonMode raster_mode =
- genX(raster_polygon_mode)(pipeline, ia->topology);
-
- wm.MultisampleRasterizationMode =
- dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ? 0 :
- genX(ms_rasterization_mode)(pipeline, raster_mode);
-#endif
-
- wm.LineStippleEnable = line && line->stippledLineEnable;
- }
-
- uint32_t dynamic_wm_states = ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
-#if GFX_VER < 8
- dynamic_wm_states |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
-#endif
-
- if (dynamic_states & dynamic_wm_states) {
- const struct intel_device_info *devinfo = &pipeline->base.device->info;
- uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm;
- GENX(3DSTATE_WM_pack)(NULL, dws, &wm);
- } else {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), _wm)
- _wm = wm;
+ /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
+ * doesn't take into account KillPixels when no depth or stencil
+ * writes are enabled. In order for occlusion queries to work
+ * correctly with no attachments, we need to force-enable PS thread
+ * dispatch.
+ *
+       * The BDW docs are pretty clear that this bit isn't validated
+ * and probably shouldn't be used in production:
+ *
+ * "This must always be set to Normal. This field should not be
+ * tested for functional validation."
+ *
+ * Unfortunately, however, the other mechanism we have for doing this
+ * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
+ * Given two bad options, we choose the one which works.
+ */
+ pipeline->force_fragment_thread_dispatch =
+ wm_prog_data->has_side_effects ||
+ wm_prog_data->uses_kill;
+ }
}
}
static void
emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
- const VkPipelineColorBlendStateCreateInfo *blend,
- const VkPipelineMultisampleStateCreateInfo *multisample)
+ const struct vk_multisample_state *ms,
+ const struct vk_color_blend_state *cb)
{
UNUSED const struct intel_device_info *devinfo =
- &pipeline->base.device->info;
+ pipeline->base.base.device->info;
const struct anv_shader_bin *fs_bin =
- pipeline->shaders[MESA_SHADER_FRAGMENT];
+ pipeline->base.shaders[MESA_SHADER_FRAGMENT];
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
-#if GFX_VER == 7
- /* Even if no fragments are ever dispatched, gfx7 hardware hangs if
- * we don't at least set the maximum number of threads.
- */
- ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
-#endif
- }
+ anv_pipeline_emit(pipeline, partial.ps, GENX(3DSTATE_PS), ps);
return;
}
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
-#if GFX_VER < 8
- /* The hardware wedges if you have this bit set but don't turn on any dual
- * source blend factors.
- */
- bool dual_src_blend = false;
- if (wm_prog_data->dual_src_blend && blend) {
- for (uint32_t i = 0; i < blend->attachmentCount; i++) {
- const VkPipelineColorBlendAttachmentState *bstate =
- &blend->pAttachments[i];
-
- if (bstate->blendEnable &&
- (is_dual_src_blend_factor(bstate->srcColorBlendFactor) ||
- is_dual_src_blend_factor(bstate->dstColorBlendFactor) ||
- is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) ||
- is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) {
- dual_src_blend = true;
- break;
- }
- }
- }
-#endif
-
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
- ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
- ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
- ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
-
- /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
- *
- * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
- * Dispatch must not be enabled for PER_PIXEL dispatch mode."
- *
- * Since 16x MSAA is first introduced on SKL, we don't need to apply
- * the workaround on any older hardware.
+ anv_pipeline_emit(pipeline, partial.ps, GENX(3DSTATE_PS), ps) {
+#if GFX_VER == 12
+ assert(wm_prog_data->dispatch_multi == 0 ||
+ (wm_prog_data->dispatch_multi == 16 && wm_prog_data->max_polygons == 2));
+ ps.DualSIMD8DispatchEnable = wm_prog_data->dispatch_multi;
+ /* XXX - No major improvement observed from enabling
+ * overlapping subspans, but it could be helpful
+ * in theory when the requirements listed on the
+ * BSpec page for 3DSTATE_PS_BODY are met.
*/
- if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch &&
- multisample && multisample->rasterizationSamples == 16) {
- assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
- ps._32PixelDispatchEnable = false;
- }
-
- ps.KernelStartPointer0 = fs_bin->kernel.offset +
- brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
- ps.KernelStartPointer1 = fs_bin->kernel.offset +
- brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
- ps.KernelStartPointer2 = fs_bin->kernel.offset +
- brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
+ ps.OverlappingSubspansEnable = false;
+#endif
ps.SingleProgramFlow = false;
- ps.VectorMaskEnable = GFX_VER >= 8;
+ ps.VectorMaskEnable = wm_prog_data->uses_vmask;
/* Wa_1606682166 */
ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
ps.BindingTableEntryCount = fs_bin->bind_map.surface_count;
+#if GFX_VER < 20
ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 ||
wm_prog_data->base.ubo_ranges[0].length;
- ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ?
- POSOFFSET_SAMPLE: POSOFFSET_NONE;
-#if GFX_VER < 8
- ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
- ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
- ps.DualSourceBlendEnable = dual_src_blend;
-#endif
-
-#if GFX_VERx10 == 75
- /* Haswell requires the sample mask to be set in this packet as well
- * as in 3DSTATE_SAMPLE_MASK; the values should match.
- */
- ps.SampleMask = 0xff;
#endif
-#if GFX_VER >= 9
- ps.MaximumNumberofThreadsPerPSD = 64 - 1;
-#elif GFX_VER >= 8
- ps.MaximumNumberofThreadsPerPSD = 64 - 2;
-#else
- ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
-#endif
-
- ps.DispatchGRFStartRegisterForConstantSetupData0 =
- brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
- ps.DispatchGRFStartRegisterForConstantSetupData1 =
- brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
- ps.DispatchGRFStartRegisterForConstantSetupData2 =
- brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
+ ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;
#if GFX_VERx10 >= 125
- ps.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, fs_bin);
+ ps.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
#else
ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
ps.ScratchSpaceBasePointer =
- get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
+ get_scratch_address(&pipeline->base.base, MESA_SHADER_FRAGMENT, fs_bin);
#endif
}
}
-#if GFX_VER >= 8
static void
emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
- struct anv_subpass *subpass,
- const VkPipelineRasterizationStateCreateInfo *rs_info)
+ const struct vk_rasterization_state *rs,
+ const struct vk_graphics_pipeline_state *state)
{
const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps);
+ anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps);
return;
}
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) {
+ anv_pipeline_emit(pipeline, partial.ps_extra, GENX(3DSTATE_PS_EXTRA), ps) {
ps.PixelShaderValid = true;
+#if GFX_VER < 20
ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
+#endif
ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
- ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
- /* If the subpass has a depth or stencil self-dependency, then we need
- * to force the hardware to do the depth/stencil write *after* fragment
- * shader execution. Otherwise, the writes may hit memory before we get
- * around to fetching from the input attachment and we may get the depth
- * or stencil value from the current draw rather than the previous one.
- */
- ps.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
- wm_prog_data->uses_kill;
-
-#if GFX_VER >= 9
ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
+#if GFX_VER >= 20
+ assert(!wm_prog_data->pulls_bary);
+#else
ps.PixelShaderPullsBary = wm_prog_data->pulls_bary;
+#endif
ps.InputCoverageMaskState = ICMS_NONE;
assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
if (!wm_prog_data->uses_sample_mask)
ps.InputCoverageMaskState = ICMS_NONE;
- else if (wm_prog_data->per_coarse_pixel_dispatch)
+ else if (brw_wm_prog_data_is_coarse(wm_prog_data, 0))
ps.InputCoverageMaskState = ICMS_NORMAL;
else if (wm_prog_data->post_depth_coverage)
ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
else
ps.InputCoverageMaskState = ICMS_NORMAL;
-#else
- ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
-#endif
#if GFX_VER >= 11
ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
wm_prog_data->uses_depth_w_coefficients;
- ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch;
#endif
}
}
static void
-emit_3dstate_vf_topology(struct anv_graphics_pipeline *pipeline)
-{
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
- vft.PrimitiveTopologyType = pipeline->topology;
- }
-}
-#endif
-
-static void
emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
{
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
+ anv_pipeline_emit(pipeline, final.vf_statistics,
+ GENX(3DSTATE_VF_STATISTICS), vfs) {
vfs.StatisticsEnable = true;
}
}
static void
compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
- const VkPipelineMultisampleStateCreateInfo *ms_info,
- const struct anv_subpass *subpass)
+ const struct vk_multisample_state *ms,
+ const struct vk_graphics_pipeline_state *state)
{
if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
pipeline->kill_pixel = false;
@@ -2411,31 +1700,47 @@ compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
* 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
* of an alpha test.
*/
+ pipeline->rp_has_ds_self_dep =
+ (state->pipeline_flags &
+ VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT) != 0;
pipeline->kill_pixel =
- subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
+ pipeline->rp_has_ds_self_dep ||
+ wm_prog_data->uses_kill ||
wm_prog_data->uses_omask ||
- (ms_info && ms_info->alphaToCoverageEnable);
+ (ms && ms->alpha_to_coverage_enable);
}
-#if GFX_VER == 12
+#if GFX_VER >= 12
static void
-emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
+emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline,
+ const struct vk_render_pass_state *rp)
{
- if (!pipeline->use_primitive_replication) {
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+ if (anv_pipeline_is_mesh(pipeline)) {
+ anv_pipeline_emit(pipeline, final.primitive_replication,
+ GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
return;
}
- uint32_t view_mask = pipeline->subpass->view_mask;
- int view_count = util_bitcount(view_mask);
- assert(view_count > 1 && view_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
+ const int replication_count =
+ anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map.num_pos_slots;
- anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
- pr.ReplicaMask = (1 << view_count) - 1;
- pr.ReplicationCount = view_count - 1;
+ assert(replication_count >= 1);
+ if (replication_count == 1) {
+ anv_pipeline_emit(pipeline, final.primitive_replication,
+ GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+ return;
+ }
+
+ assert(replication_count == util_bitcount(rp->view_mask));
+ assert(replication_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
+
+ anv_pipeline_emit(pipeline, final.primitive_replication,
+ GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
+ pr.ReplicaMask = (1 << replication_count) - 1;
+ pr.ReplicationCount = replication_count - 1;
int i = 0;
- u_foreach_bit(view_index, view_mask) {
+ u_foreach_bit(view_index, rp->view_mask) {
pr.RTAIOffset[i] = view_index;
i++;
}
@@ -2443,174 +1748,293 @@ emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
}
#endif
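Editor's note (not part of the patch): the 3DSTATE_PRIMITIVE_REPLICATION fields above follow mechanically from the render-pass view mask; for example view_mask 0xb yields ReplicaMask 0x7, ReplicationCount 2 and RTAIOffset { 0, 1, 3 }. A minimal standalone sketch of that derivation, using plain C in place of anv's util_bitcount()/u_foreach_bit() helpers:

#include <stdint.h>

struct prim_replication_fields {
   uint32_t replica_mask;        /* (1 << bitcount) - 1                */
   uint32_t replication_count;   /* bitcount - 1                       */
   uint32_t rtai_offset[16];     /* render-target-array index per view */
};

static struct prim_replication_fields
derive_prim_replication(uint32_t view_mask)
{
   struct prim_replication_fields f = {0};
   unsigned count = __builtin_popcount(view_mask);   /* stands in for util_bitcount() */
   f.replica_mask = (1u << count) - 1;
   f.replication_count = count - 1;
   unsigned i = 0;
   for (unsigned v = 0; v < 32; v++) {               /* stands in for u_foreach_bit() */
      if (view_mask & (1u << v))
         f.rtai_offset[i++] = v;
   }
   return f;
}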
-static VkResult
-genX(graphics_pipeline_create)(
- VkDevice _device,
- struct anv_pipeline_cache * cache,
- const VkGraphicsPipelineCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipeline)
+#if GFX_VERx10 >= 125
+static void
+emit_task_state(struct anv_graphics_pipeline *pipeline)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
- struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
- struct anv_graphics_pipeline *pipeline;
- VkResult result;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
-
- /* Use the default pipeline cache if none is specified */
- if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
- cache = &device->default_pipeline_cache;
-
- pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (pipeline == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- result = anv_graphics_pipeline_init(pipeline, device, cache,
- pCreateInfo, pAllocator);
- if (result != VK_SUCCESS) {
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
- *pPipeline = VK_NULL_HANDLE;
- return result;
+ assert(anv_pipeline_is_mesh(pipeline));
+
+ if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
+ anv_pipeline_emit(pipeline, final.task_control,
+ GENX(3DSTATE_TASK_CONTROL), zero);
+ anv_pipeline_emit(pipeline, final.task_shader,
+ GENX(3DSTATE_TASK_SHADER), zero);
+ anv_pipeline_emit(pipeline, final.task_redistrib,
+ GENX(3DSTATE_TASK_REDISTRIB), zero);
+ return;
+ }
+
+ const struct anv_shader_bin *task_bin =
+ pipeline->base.shaders[MESA_SHADER_TASK];
+
+ anv_pipeline_emit(pipeline, final.task_control,
+ GENX(3DSTATE_TASK_CONTROL), tc) {
+ tc.TaskShaderEnable = true;
+ tc.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_TASK, task_bin);
+ tc.MaximumNumberofThreadGroups = 511;
+ }
+
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
+ const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
+ const struct intel_cs_dispatch_info task_dispatch =
+ brw_cs_get_dispatch_info(devinfo, &task_prog_data->base, NULL);
+
+ anv_pipeline_emit(pipeline, final.task_shader,
+ GENX(3DSTATE_TASK_SHADER), task) {
+ task.KernelStartPointer = task_bin->kernel.offset;
+ task.SIMDSize = task_dispatch.simd_size / 16;
+ task.MessageSIMD = task.SIMDSize;
+ task.NumberofThreadsinGPGPUThreadGroup = task_dispatch.threads;
+ task.ExecutionMask = task_dispatch.right_mask;
+ task.LocalXMaximum = task_dispatch.group_size - 1;
+ task.EmitLocalIDX = true;
+
+ task.NumberofBarriers = task_prog_data->base.uses_barrier;
+ task.SharedLocalMemorySize =
+ encode_slm_size(GFX_VER, task_prog_data->base.base.total_shared);
+ task.PreferredSLMAllocationSize =
+ preferred_slm_allocation_size(devinfo);
+
+ /*
+ * 3DSTATE_TASK_SHADER_DATA.InlineData[0:1] will be used for the address
+ * of a buffer holding the push constants and the descriptor set table,
+ * and InlineData[2:7] will be used for the first few push constants.
+ */
+ task.EmitInlineParameter = true;
+
+ task.XP0Required = task_prog_data->uses_drawid;
}
- /* Information on which states are considered dynamic. */
- const VkPipelineDynamicStateCreateInfo *dyn_info =
- pCreateInfo->pDynamicState;
- uint32_t dynamic_states = 0;
- if (dyn_info) {
- for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
- dynamic_states |=
- anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
+ /* Recommended values from "Task and Mesh Distribution Programming". */
+ anv_pipeline_emit(pipeline, final.task_redistrib,
+ GENX(3DSTATE_TASK_REDISTRIB), redistrib) {
+ redistrib.LocalBOTAccumulatorThreshold = MULTIPLIER_1;
+ redistrib.SmallTaskThreshold = 1; /* 2^N */
+ redistrib.TargetMeshBatchSize = devinfo->num_slices > 2 ? 3 : 5; /* 2^N */
+ redistrib.TaskRedistributionLevel = TASKREDISTRIB_BOM;
+ redistrib.TaskRedistributionMode = TASKREDISTRIB_RR_STRICT;
}
+}
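Editor's note (not part of the patch): one way to picture the 8-dword inline parameter block described in the comment inside emit_task_state() above; the names below are illustrative only, not anv's.

#include <stdint.h>

union task_inline_data_example {
   uint32_t dw[8];
   struct {
      uint64_t push_and_descriptor_addr;  /* InlineData[0:1]: buffer address       */
      uint32_t leading_push_consts[6];    /* InlineData[2:7]: first push constants */
   } fields;
};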
+static void
+emit_mesh_state(struct anv_graphics_pipeline *pipeline)
+{
+ assert(anv_pipeline_is_mesh(pipeline));
- /* If rasterization is not enabled, various CreateInfo structs must be
- * ignored.
- */
- const bool raster_enabled =
- !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
- (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
+ const struct anv_shader_bin *mesh_bin = pipeline->base.shaders[MESA_SHADER_MESH];
- const VkPipelineViewportStateCreateInfo *vp_info =
- raster_enabled ? pCreateInfo->pViewportState : NULL;
+ anv_pipeline_emit(pipeline, final.mesh_control,
+ GENX(3DSTATE_MESH_CONTROL), mc) {
+ mc.MeshShaderEnable = true;
+ mc.ScratchSpaceBuffer =
+ get_scratch_surf(&pipeline->base.base, MESA_SHADER_MESH, mesh_bin);
+ mc.MaximumNumberofThreadGroups = 511;
+ }
- const VkPipelineMultisampleStateCreateInfo *ms_info =
- raster_enabled ? pCreateInfo->pMultisampleState : NULL;
+ const struct intel_device_info *devinfo = pipeline->base.base.device->info;
+ const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
+ const struct intel_cs_dispatch_info mesh_dispatch =
+ brw_cs_get_dispatch_info(devinfo, &mesh_prog_data->base, NULL);
- const VkPipelineDepthStencilStateCreateInfo *ds_info =
- raster_enabled ? pCreateInfo->pDepthStencilState : NULL;
+ const unsigned output_topology =
+ mesh_prog_data->primitive_type == MESA_PRIM_POINTS ? OUTPUT_POINT :
+ mesh_prog_data->primitive_type == MESA_PRIM_LINES ? OUTPUT_LINE :
+ OUTPUT_TRI;
- const VkPipelineColorBlendStateCreateInfo *cb_info =
- raster_enabled ? pCreateInfo->pColorBlendState : NULL;
+ uint32_t index_format;
+ switch (mesh_prog_data->index_format) {
+ case BRW_INDEX_FORMAT_U32:
+ index_format = INDEX_U32;
+ break;
+ case BRW_INDEX_FORMAT_U888X:
+ index_format = INDEX_U888X;
+ break;
+ default:
+ unreachable("invalid index format");
+ }
+
+ anv_pipeline_emit(pipeline, final.mesh_shader,
+ GENX(3DSTATE_MESH_SHADER), mesh) {
+ mesh.KernelStartPointer = mesh_bin->kernel.offset;
+ mesh.SIMDSize = mesh_dispatch.simd_size / 16;
+ mesh.MessageSIMD = mesh.SIMDSize;
+ mesh.NumberofThreadsinGPGPUThreadGroup = mesh_dispatch.threads;
+ mesh.ExecutionMask = mesh_dispatch.right_mask;
+ mesh.LocalXMaximum = mesh_dispatch.group_size - 1;
+ mesh.EmitLocalIDX = true;
+
+ mesh.MaximumPrimitiveCount = MAX2(mesh_prog_data->map.max_primitives, 1) - 1;
+ mesh.OutputTopology = output_topology;
+ mesh.PerVertexDataPitch = mesh_prog_data->map.per_vertex_pitch_dw / 8;
+ mesh.PerPrimitiveDataPresent = mesh_prog_data->map.per_primitive_pitch_dw > 0;
+ mesh.PerPrimitiveDataPitch = mesh_prog_data->map.per_primitive_pitch_dw / 8;
+ mesh.IndexFormat = index_format;
+
+ mesh.NumberofBarriers = mesh_prog_data->base.uses_barrier;
+ mesh.SharedLocalMemorySize =
+ encode_slm_size(GFX_VER, mesh_prog_data->base.base.total_shared);
+ mesh.PreferredSLMAllocationSize =
+ preferred_slm_allocation_size(devinfo);
+
+ /*
+ * 3DSTATE_MESH_SHADER_DATA.InlineData[0:1] will be used for the address
+ * of a buffer holding the push constants and the descriptor set table,
+ * and InlineData[2:7] will be used for the first few push constants.
+ */
+ mesh.EmitInlineParameter = true;
- const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
- vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
- PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
+ mesh.XP0Required = mesh_prog_data->uses_drawid;
+ }
+ /* Recommended values from "Task and Mesh Distribution Programming". */
+ anv_pipeline_emit(pipeline, final.mesh_distrib,
+ GENX(3DSTATE_MESH_DISTRIB), distrib) {
+ distrib.DistributionMode = MESH_RR_FREE;
+ distrib.TaskDistributionBatchSize = devinfo->num_slices > 2 ? 4 : 9; /* 2^N thread groups */
+ distrib.MeshDistributionBatchSize = devinfo->num_slices > 2 ? 3 : 3; /* 2^N thread groups */
+ }
+}
+#endif
+
+void
+genX(graphics_pipeline_emit)(struct anv_graphics_pipeline *pipeline,
+ const struct vk_graphics_pipeline_state *state)
+{
enum intel_urb_deref_block_size urb_deref_block_size;
emit_urb_setup(pipeline, &urb_deref_block_size);
- assert(pCreateInfo->pVertexInputState);
- emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);
- assert(pCreateInfo->pRasterizationState);
- emit_rs_state(pipeline, pCreateInfo->pInputAssemblyState,
- pCreateInfo->pRasterizationState,
- ms_info, line_info, dynamic_states, pass, subpass,
- urb_deref_block_size);
- emit_ms_state(pipeline, ms_info, dynamic_states);
- emit_ds_state(pipeline, ds_info, dynamic_states, pass, subpass);
- emit_cb_state(pipeline, cb_info, ms_info, dynamic_states);
- compute_kill_pixel(pipeline, ms_info, subpass);
-
- emit_3dstate_clip(pipeline,
- pCreateInfo->pInputAssemblyState,
- vp_info,
- pCreateInfo->pRasterizationState,
- dynamic_states);
- emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState,
- dynamic_states);
+ emit_rs_state(pipeline, state->ia, state->rs, state->ms, state->rp,
+ urb_deref_block_size);
+ emit_ms_state(pipeline, state->ms);
+ compute_kill_pixel(pipeline, state->ms, state);
-#if GFX_VER == 12
- emit_3dstate_primitive_replication(pipeline);
+ emit_3dstate_clip(pipeline, state->ia, state->vp, state->rs);
+
+#if GFX_VER >= 12
+ emit_3dstate_primitive_replication(pipeline, state->rp);
#endif
-#if 0
- /* From gfx7_vs_state.c */
+#if GFX_VERx10 >= 125
+ bool needs_instance_granularity =
+ intel_needs_workaround(pipeline->base.base.device->info, 14019166699) &&
+ (sbe_primitive_id_override(pipeline) ||
+ geom_or_tess_prim_id_used(pipeline));
+
+ anv_pipeline_emit(pipeline, partial.vfg, GENX(3DSTATE_VFG), vfg) {
+ /* If 3DSTATE_TE: TE Enable == 1 then RR_STRICT else RR_FREE */
+ vfg.DistributionMode =
+ anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL) ? RR_STRICT :
+ RR_FREE;
+ vfg.DistributionGranularity = needs_instance_granularity ?
+ InstanceLevelGranularity : BatchLevelGranularity;
+#if INTEL_WA_14014851047_GFX_VER
+ vfg.GranularityThresholdDisable =
+ intel_needs_workaround(pipeline->base.base.device->info, 14014851047);
+#endif
+ /* 192 vertices for TRILIST_ADJ */
+ vfg.ListNBatchSizeScale = 0;
+ /* Batch size of 384 vertices */
+ vfg.List3BatchSizeScale = 2;
+ /* Batch size of 128 vertices */
+ vfg.List2BatchSizeScale = 1;
+ /* Batch size of 128 vertices */
+ vfg.List1BatchSizeScale = 2;
+ /* Batch size of 256 vertices for STRIP topologies */
+ vfg.StripBatchSizeScale = 3;
+ /* 192 control points for PATCHLIST_3 */
+ vfg.PatchBatchSizeScale = 1;
+ /* 192 control points for PATCHLIST_3 */
+ vfg.PatchBatchSizeMultiplier = 31;
+ }
+#endif
- /**
- * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
- * Geometry > Geometry Shader > State:
- *
- * "Note: Because of corruption in IVB:GT2, software needs to flush the
- * whole fixed function pipeline when the GS enable changes value in
- * the 3DSTATE_GS."
- *
- * The hardware architects have clarified that in this context "flush the
- * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
- * Stall" bit set.
- */
- if (!device->info.is_haswell && !device->info.is_baytrail)
- gfx7_emit_vs_workaround_flush(brw);
+ emit_3dstate_vf_statistics(pipeline);
+
+ if (anv_pipeline_is_primitive(pipeline)) {
+ emit_vertex_input(pipeline, state, state->vi);
+
+ emit_3dstate_vs(pipeline);
+ emit_3dstate_hs_ds(pipeline, state->ts);
+ emit_3dstate_te(pipeline);
+ emit_3dstate_gs(pipeline);
+
+ emit_3dstate_streamout(pipeline, state->rs);
+
+#if GFX_VERx10 >= 125
+ const struct anv_device *device = pipeline->base.base.device;
+ /* Disable Mesh. */
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_pipeline_emit(pipeline, final.mesh_control,
+ GENX(3DSTATE_MESH_CONTROL), zero);
+ anv_pipeline_emit(pipeline, final.mesh_shader,
+ GENX(3DSTATE_MESH_SHADER), zero);
+ anv_pipeline_emit(pipeline, final.mesh_distrib,
+ GENX(3DSTATE_MESH_DISTRIB), zero);
+ anv_pipeline_emit(pipeline, final.clip_mesh,
+ GENX(3DSTATE_CLIP_MESH), zero);
+ anv_pipeline_emit(pipeline, final.sbe_mesh,
+ GENX(3DSTATE_SBE_MESH), zero);
+ anv_pipeline_emit(pipeline, final.task_control,
+ GENX(3DSTATE_TASK_CONTROL), zero);
+ anv_pipeline_emit(pipeline, final.task_shader,
+ GENX(3DSTATE_TASK_SHADER), zero);
+ anv_pipeline_emit(pipeline, final.task_redistrib,
+ GENX(3DSTATE_TASK_REDISTRIB), zero);
+ }
#endif
+ } else {
+ assert(anv_pipeline_is_mesh(pipeline));
- emit_3dstate_vs(pipeline);
- emit_3dstate_hs_te_ds(pipeline, pCreateInfo->pTessellationState);
- emit_3dstate_gs(pipeline);
- emit_3dstate_sbe(pipeline);
- emit_3dstate_wm(pipeline, subpass,
- pCreateInfo->pInputAssemblyState,
- pCreateInfo->pRasterizationState,
- cb_info, ms_info, line_info, dynamic_states);
- emit_3dstate_ps(pipeline, cb_info, ms_info);
-#if GFX_VER >= 8
- emit_3dstate_ps_extra(pipeline, subpass,
- pCreateInfo->pRasterizationState);
-
- if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
- emit_3dstate_vf_topology(pipeline);
+ anv_pipeline_emit(pipeline, final.vf_sgvs, GENX(3DSTATE_VF_SGVS), sgvs);
+#if GFX_VER >= 11
+ anv_pipeline_emit(pipeline, final.vf_sgvs_2, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif
- emit_3dstate_vf_statistics(pipeline);
+ anv_pipeline_emit(pipeline, final.vs, GENX(3DSTATE_VS), vs);
+ anv_pipeline_emit(pipeline, final.hs, GENX(3DSTATE_HS), hs);
+ anv_pipeline_emit(pipeline, final.ds, GENX(3DSTATE_DS), ds);
+ anv_pipeline_emit(pipeline, partial.te, GENX(3DSTATE_TE), te);
+ anv_pipeline_emit(pipeline, partial.gs, GENX(3DSTATE_GS), gs);
- *pPipeline = anv_pipeline_to_handle(&pipeline->base);
+ /* BSpec 46303 forbids both 3DSTATE_MESH_CONTROL.MeshShaderEnable
+ * and 3DSTATE_STREAMOUT.SOFunctionEnable to be 1.
+ */
+ anv_pipeline_emit(pipeline, partial.so, GENX(3DSTATE_STREAMOUT), so);
- return pipeline->base.batch.status;
+#if GFX_VERx10 >= 125
+ emit_task_state(pipeline);
+ emit_mesh_state(pipeline);
+#endif
+ }
+
+ emit_3dstate_sbe(pipeline);
+ emit_3dstate_wm(pipeline, state->ia, state->rs,
+ state->ms, state->cb, state->rp);
+ emit_3dstate_ps(pipeline, state->ms, state->cb);
+ emit_3dstate_ps_extra(pipeline, state->rs, state);
}
#if GFX_VERx10 >= 125
-static void
-emit_compute_state(struct anv_compute_pipeline *pipeline,
- const struct anv_device *device)
+void
+genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
{
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
-
- const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
- const struct intel_device_info *devinfo = &device->info;
-
- anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
- cfe.MaximumNumberofThreads =
- devinfo->max_cs_threads * devinfo->subslice_total - 1;
- cfe.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, cs_bin);
- }
}
#else /* #if GFX_VERx10 >= 125 */
-static void
-emit_compute_state(struct anv_compute_pipeline *pipeline,
- const struct anv_device *device)
+void
+genX(compute_pipeline_emit)(struct anv_compute_pipeline *pipeline)
{
- const struct intel_device_info *devinfo = &device->info;
+ struct anv_device *device = pipeline->base.device;
+ const struct intel_device_info *devinfo = device->info;
const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
- const struct brw_cs_dispatch_info dispatch =
+ const struct intel_cs_dispatch_info dispatch =
brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
const uint32_t vfe_curbe_allocation =
ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
@@ -2619,43 +2043,22 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
const struct anv_shader_bin *cs_bin = pipeline->cs;
anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
-#if GFX_VER > 7
vfe.StackSize = 0;
-#else
- vfe.GPGPUMode = true;
-#endif
vfe.MaximumNumberofThreads =
devinfo->max_cs_threads * devinfo->subslice_total - 1;
- vfe.NumberofURBEntries = GFX_VER <= 7 ? 0 : 2;
+ vfe.NumberofURBEntries = 2;
#if GFX_VER < 11
vfe.ResetGatewayTimer = true;
#endif
-#if GFX_VER <= 8
- vfe.BypassGatewayControl = true;
-#endif
- vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2;
+ vfe.URBEntryAllocationSize = 2;
vfe.CURBEAllocationSize = vfe_curbe_allocation;
if (cs_bin->prog_data->total_scratch) {
- if (GFX_VER >= 8) {
- /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
- * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
- */
- vfe.PerThreadScratchSpace =
- ffs(cs_bin->prog_data->total_scratch) - 11;
- } else if (GFX_VERx10 == 75) {
- /* Haswell's Per Thread Scratch Space is in the range [0, 10]
- * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
- */
- vfe.PerThreadScratchSpace =
- ffs(cs_bin->prog_data->total_scratch) - 12;
- } else {
- /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
- * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
- */
- vfe.PerThreadScratchSpace =
- cs_bin->prog_data->total_scratch / 1024 - 1;
- }
+ /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
+ * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
+ */
+ vfe.PerThreadScratchSpace =
+ ffs(cs_bin->prog_data->total_scratch) - 11;
vfe.ScratchSpaceBasePointer =
get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
}
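Editor's note (not part of the patch): a quick check of the scratch-space encoding used above, assuming total_scratch has already been rounded up to a power of two of at least 1 KB; e.g. 64 KB gives ffs(0x10000) - 11 = 6, and 1 KB << 6 == 64 KB. The helper below is illustrative, not anv's.

#include <assert.h>
#include <strings.h>   /* ffs() */

static int
encode_per_thread_scratch_example(unsigned total_scratch)
{
   /* Field range [0, 11] maps to 1 KB, 2 KB, ..., 2 MB. */
   assert(total_scratch >= 1024 &&
          (total_scratch & (total_scratch - 1)) == 0);
   int encoded = ffs(total_scratch) - 11;
   assert(encoded >= 0 && encoded <= 11);
   return encoded;
}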
@@ -2670,20 +2073,19 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
/* We add 1 because the CS indirect parameters buffer isn't accounted
* for in bind_map.surface_count.
+ *
+ * Typically set to 0 to avoid prefetching on every thread dispatch.
*/
- .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
+ .BindingTableEntryCount = devinfo->verx10 == 125 ?
+ 0 : 1 + MIN2(pipeline->cs->bind_map.surface_count, 30),
.BarrierEnable = cs_prog_data->uses_barrier,
.SharedLocalMemorySize =
encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),
-#if GFX_VERx10 != 75
.ConstantURBEntryReadOffset = 0,
-#endif
.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
-#if GFX_VERx10 >= 75
.CrossThreadConstantDataReadLength =
cs_prog_data->push.cross_thread.regs,
-#endif
#if GFX_VER >= 12
/* TODO: Check if we are missing workarounds and enable mid-thread
* preemption.
@@ -2706,268 +2108,38 @@ emit_compute_state(struct anv_compute_pipeline *pipeline,
#endif /* #if GFX_VERx10 >= 125 */
-static VkResult
-compute_pipeline_create(
- VkDevice _device,
- struct anv_pipeline_cache * cache,
- const VkComputePipelineCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipeline)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_compute_pipeline *pipeline;
- VkResult result;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
-
- /* Use the default pipeline cache if none is specified */
- if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
- cache = &device->default_pipeline_cache;
-
- pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
- VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
- if (pipeline == NULL)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- result = anv_pipeline_init(&pipeline->base, device,
- ANV_PIPELINE_COMPUTE, pCreateInfo->flags,
- pAllocator);
- if (result != VK_SUCCESS) {
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- return result;
- }
-
- anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
- pipeline->batch_data, sizeof(pipeline->batch_data));
-
- assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
- VK_FROM_HANDLE(vk_shader_module, module, pCreateInfo->stage.module);
- result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
- pCreateInfo->stage.pName,
- pCreateInfo->stage.pSpecializationInfo);
- if (result != VK_SUCCESS) {
- anv_pipeline_finish(&pipeline->base, device, pAllocator);
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
- *pPipeline = VK_NULL_HANDLE;
- return result;
- }
-
- emit_compute_state(pipeline, device);
-
- *pPipeline = anv_pipeline_to_handle(&pipeline->base);
-
- return pipeline->base.batch.status;
-}
-
-VkResult genX(CreateGraphicsPipelines)(
- VkDevice _device,
- VkPipelineCache pipelineCache,
- uint32_t count,
- const VkGraphicsPipelineCreateInfo* pCreateInfos,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipelines)
-{
- ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
-
- VkResult result = VK_SUCCESS;
-
- unsigned i;
- for (i = 0; i < count; i++) {
- VkResult res = genX(graphics_pipeline_create)(_device,
- pipeline_cache,
- &pCreateInfos[i],
- pAllocator, &pPipelines[i]);
-
- if (res == VK_SUCCESS)
- continue;
-
- /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it
- * is not obvious what error should be report upon 2 different failures.
- * */
- result = res;
- if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
- break;
-
- if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
- break;
- }
-
- for (; i < count; i++)
- pPipelines[i] = VK_NULL_HANDLE;
-
- return result;
-}
-
-VkResult genX(CreateComputePipelines)(
- VkDevice _device,
- VkPipelineCache pipelineCache,
- uint32_t count,
- const VkComputePipelineCreateInfo* pCreateInfos,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipelines)
-{
- ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
-
- VkResult result = VK_SUCCESS;
-
- unsigned i;
- for (i = 0; i < count; i++) {
- VkResult res = compute_pipeline_create(_device, pipeline_cache,
- &pCreateInfos[i],
- pAllocator, &pPipelines[i]);
-
- if (res == VK_SUCCESS)
- continue;
-
- /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it
- * is not obvious what error should be report upon 2 different failures.
- * */
- result = res;
- if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
- break;
-
- if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
- break;
- }
-
- for (; i < count; i++)
- pPipelines[i] = VK_NULL_HANDLE;
-
- return result;
-}
-
#if GFX_VERx10 >= 125
-static void
-assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
- uint32_t stage_idx,
- VkShaderStageFlags valid_stages)
-{
- if (stage_idx == VK_SHADER_UNUSED_KHR)
- return;
-
- assert(stage_idx <= pCreateInfo->stageCount);
- assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1);
- assert(pCreateInfo->pStages[stage_idx].stage & valid_stages);
-}
-
-static VkResult
-ray_tracing_pipeline_create(
- VkDevice _device,
- struct anv_pipeline_cache * cache,
- const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipeline)
+void
+genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
{
- ANV_FROM_HANDLE(anv_device, device, _device);
- VkResult result;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR);
-
- /* Use the default pipeline cache if none is specified */
- if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
- cache = &device->default_pipeline_cache;
-
- VK_MULTIALLOC(ma);
- VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1);
- VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, pCreateInfo->groupCount);
- if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
- VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- result = anv_pipeline_init(&pipeline->base, device,
- ANV_PIPELINE_RAY_TRACING, pCreateInfo->flags,
- pAllocator);
- if (result != VK_SUCCESS) {
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- return result;
- }
-
- pipeline->group_count = pCreateInfo->groupCount;
- pipeline->groups = groups;
-
- ASSERTED const VkShaderStageFlags ray_tracing_stages =
- VK_SHADER_STAGE_RAYGEN_BIT_KHR |
- VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
- VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
- VK_SHADER_STAGE_MISS_BIT_KHR |
- VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
- VK_SHADER_STAGE_CALLABLE_BIT_KHR;
-
- for (uint32_t i = 0; i < pCreateInfo->stageCount; i++)
- assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0);
-
- for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) {
- const VkRayTracingShaderGroupCreateInfoKHR *ginfo =
- &pCreateInfo->pGroups[i];
- assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader,
- VK_SHADER_STAGE_RAYGEN_BIT_KHR |
- VK_SHADER_STAGE_MISS_BIT_KHR |
- VK_SHADER_STAGE_CALLABLE_BIT_KHR);
- assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader,
- VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR);
- assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader,
- VK_SHADER_STAGE_ANY_HIT_BIT_KHR);
- assert_rt_stage_index_valid(pCreateInfo, ginfo->intersectionShader,
- VK_SHADER_STAGE_INTERSECTION_BIT_KHR);
- switch (ginfo->type) {
- case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:
- assert(ginfo->generalShader < pCreateInfo->stageCount);
- assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR);
- assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR);
- assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
- break;
-
- case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:
- assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
- assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
- break;
-
- case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:
- assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
- break;
-
- default:
- unreachable("Invalid ray-tracing shader group type");
- }
- }
-
- result = anv_ray_tracing_pipeline_init(pipeline, device, cache,
- pCreateInfo, pAllocator);
- if (result != VK_SUCCESS) {
- anv_pipeline_finish(&pipeline->base, device, pAllocator);
- vk_free2(&device->vk.alloc, pAllocator, pipeline);
- return result;
- }
-
for (uint32_t i = 0; i < pipeline->group_count; i++) {
struct anv_rt_shader_group *group = &pipeline->groups[i];
switch (group->type) {
case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {
- struct GFX_RT_GENERAL_SBT_HANDLE sh = {};
+ struct GENX(RT_GENERAL_SBT_HANDLE) sh = {};
sh.General = anv_shader_bin_get_bsr(group->general, 32);
- GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh);
+ GENX(RT_GENERAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
break;
}
case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {
- struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {};
+ struct GENX(RT_TRIANGLES_SBT_HANDLE) sh = {};
if (group->closest_hit)
sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
if (group->any_hit)
sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);
- GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh);
+ GENX(RT_TRIANGLES_SBT_HANDLE_pack)(NULL, group->handle, &sh);
break;
}
case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
- struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {};
+ struct GENX(RT_PROCEDURAL_SBT_HANDLE) sh = {};
if (group->closest_hit)
sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);
- GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh);
+ GENX(RT_PROCEDURAL_SBT_HANDLE_pack)(NULL, group->handle, &sh);
break;
}
@@ -2975,48 +2147,14 @@ ray_tracing_pipeline_create(
unreachable("Invalid shader group type");
}
}
-
- *pPipeline = anv_pipeline_to_handle(&pipeline->base);
-
- return pipeline->base.batch.status;
}
-VkResult
-genX(CreateRayTracingPipelinesKHR)(
- VkDevice _device,
- VkDeferredOperationKHR deferredOperation,
- VkPipelineCache pipelineCache,
- uint32_t createInfoCount,
- const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
- const VkAllocationCallbacks* pAllocator,
- VkPipeline* pPipelines)
-{
- ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
-
- VkResult result = VK_SUCCESS;
-
- unsigned i;
- for (i = 0; i < createInfoCount; i++) {
- VkResult res = ray_tracing_pipeline_create(_device, pipeline_cache,
- &pCreateInfos[i],
- pAllocator, &pPipelines[i]);
-
- if (res == VK_SUCCESS)
- continue;
-
- /* Bail out on the first error as it is not obvious what error should be
- * report upon 2 different failures. */
- result = res;
- if (result != VK_PIPELINE_COMPILE_REQUIRED_EXT)
- break;
-
- if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
- break;
- }
-
- for (; i < createInfoCount; i++)
- pPipelines[i] = VK_NULL_HANDLE;
+#else
- return result;
+void
+genX(ray_tracing_pipeline_emit)(struct anv_ray_tracing_pipeline *pipeline)
+{
+ unreachable("Ray tracing not supported");
}
+
#endif /* GFX_VERx10 >= 125 */
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 8978f5843a9..2cb492afcf9 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -29,15 +29,21 @@
#include "anv_private.h"
+#include "util/os_time.h"
+
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
+#include "ds/intel_tracepoints.h"
+
+#include "anv_internal_kernels.h"
+
/* We reserve:
* - GPR 14 for perf queries
* - GPR 15 for conditional rendering
*/
#define MI_BUILDER_NUM_ALLOC_GPRS 14
-#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
+#define MI_BUILDER_CAN_WRITE_BATCH true
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
@@ -57,6 +63,18 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query)
};
}
+static void
+emit_query_mi_flush_availability(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address addr,
+ bool available)
+{
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), flush) {
+ flush.PostSyncOperation = WriteImmediateData;
+ flush.Address = addr;
+ flush.ImmediateData = available;
+ }
+}
+
VkResult genX(CreateQueryPool)(
VkDevice _device,
const VkQueryPoolCreateInfo* pCreateInfo,
@@ -65,12 +83,10 @@ VkResult genX(CreateQueryPool)(
{
ANV_FROM_HANDLE(anv_device, device, _device);
const struct anv_physical_device *pdevice = device->physical;
-#if GFX_VER >= 8
const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
struct intel_perf_counter_pass *counter_pass;
struct intel_perf_query_info **pass_query;
uint32_t n_passes = 0;
-#endif
uint32_t data_offset = 0;
VK_MULTIALLOC(ma);
VkResult result;
@@ -123,14 +139,13 @@ VkResult genX(CreateQueryPool)(
uint64s_per_slot = 2; /* availability + marker */
/* Align to the requirement of the layout */
- uint64s_per_slot = align_u32(uint64s_per_slot,
- DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
+ uint64s_per_slot = align(uint64s_per_slot,
+ DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
data_offset = uint64s_per_slot * sizeof(uint64_t);
/* Add the query data for begin & end commands */
uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
break;
}
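Editor's note (not part of the patch): a worked example of the slot sizing above, with made-up layout numbers (alignment = 64 B, snapshot size = 184 B) purely to make the arithmetic concrete.

#include <stdint.h>

#define DIV_ROUND_UP_EX(a, b)  (((a) + (b) - 1) / (b))
#define ALIGN_POT_EX(v, a)     (((v) + (a) - 1) & ~((uint64_t)(a) - 1))

static void
intel_query_slot_sizing_example(void)
{
   uint64_t uint64s_per_slot = 2;                       /* availability + marker */
   uint64s_per_slot = ALIGN_POT_EX(uint64s_per_slot,
                                   DIV_ROUND_UP_EX(64, sizeof(uint64_t)));  /* -> 8   */
   uint64_t data_offset = uint64s_per_slot * sizeof(uint64_t);              /* -> 64  */
   uint64s_per_slot += 2 * DIV_ROUND_UP_EX(184, sizeof(uint64_t));          /* -> 54  */
   uint64_t stride = uint64s_per_slot * sizeof(uint64_t);                   /* -> 432 */
   (void)data_offset;
   (void)stride;
}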
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
const struct intel_perf_query_field_layout *layout =
&pdevice->perf->query_layout;
@@ -145,10 +160,10 @@ VkResult genX(CreateQueryPool)(
perf_query_info->counterIndexCount);
vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
n_passes);
- uint64s_per_slot = 4 /* availability + small batch */;
+ uint64s_per_slot = 1 /* availability */;
/* Align to the requirement of the layout */
- uint64s_per_slot = align_u32(uint64s_per_slot,
- DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
+ uint64s_per_slot = align(uint64s_per_slot,
+ DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
data_offset = uint64s_per_slot * sizeof(uint64_t);
/* Add the query data for begin & end commands */
uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
@@ -156,26 +171,41 @@ VkResult genX(CreateQueryPool)(
uint64s_per_slot *= n_passes;
break;
}
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ /* Query has two values: begin and end. */
+ uint64s_per_slot = 1 + 2;
+ break;
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ uint64s_per_slot = 1 + 1 /* availability + size (PostbuildInfoCurrentSize, PostbuildInfoCompactedSize) */;
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ uint64s_per_slot = 1 + 2 /* availability + size (PostbuildInfoSerializationDesc) */;
+ break;
+
#endif
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ uint64s_per_slot = 1;
+ break;
default:
assert(!"Invalid query type");
}
- if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
- VK_OBJECT_TYPE_QUERY_POOL))
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+ if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
+ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
- pool->type = pCreateInfo->queryType;
- pool->pipeline_statistics = pipeline_statistics;
+ vk_query_pool_init(&device->vk, &pool->vk, pCreateInfo);
pool->stride = uint64s_per_slot * sizeof(uint64_t);
- pool->slots = pCreateInfo->queryCount;
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
pool->data_offset = data_offset;
pool->snapshot_size = (pool->stride - data_offset) / 2;
}
-#if GFX_VER >= 8
- else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ else if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
pool->pass_size = pool->stride / n_passes;
pool->data_offset = data_offset;
pool->snapshot_size = (pool->pass_size - data_offset) / 2;
@@ -192,19 +222,27 @@ VkResult genX(CreateQueryPool)(
perf_query_info->counterIndexCount,
pool->pass_query);
}
-#endif
- uint64_t size = pool->slots * (uint64_t)pool->stride;
+ uint64_t size = pool->vk.query_count * (uint64_t)pool->stride;
+
+ /* For KHR_performance_query we need some space in the buffer for a small
+ * batch updating ANV_PERF_QUERY_OFFSET_REG.
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ pool->khr_perf_preamble_stride = 32;
+ pool->khr_perf_preambles_offset = size;
+ size += (uint64_t)pool->n_passes * pool->khr_perf_preamble_stride;
+ }
+ }
+
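Editor's note (not part of the patch): the sizing above implies that the per-pass 32-byte MI preambles live right after the query slots; a sketch of the resulting offset computation (helper name is illustrative, not anv's).

#include <stdint.h>

static uint64_t
khr_perf_preamble_offset_example(uint64_t query_count, uint64_t slot_stride,
                                 uint32_t preamble_stride /* 32 */,
                                 uint32_t pass)
{
   uint64_t preambles_offset = query_count * slot_stride;   /* query slots come first */
   return preambles_offset + (uint64_t)pass * preamble_stride;
}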
result = anv_device_alloc_bo(device, "query-pool", size,
ANV_BO_ALLOC_MAPPED |
- ANV_BO_ALLOC_SNOOPED,
+ ANV_BO_ALLOC_HOST_CACHED_COHERENT,
0 /* explicit_address */,
&pool->bo);
if (result != VK_SUCCESS)
goto fail;
-#if GFX_VER >= 8
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
struct mi_builder b;
struct anv_batch batch = {
@@ -213,13 +251,14 @@ VkResult genX(CreateQueryPool)(
};
batch.next = batch.start;
- mi_builder_init(&b, &device->info, &batch);
+ mi_builder_init(&b, device->info, &batch);
mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
mi_imm(p * (uint64_t)pool->pass_size));
anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
}
}
-#endif
+
+ ANV_RMV(query_pool_create, device, pool, false);
*pQueryPool = anv_query_pool_to_handle(pool);
@@ -242,47 +281,36 @@ void genX(DestroyQueryPool)(
if (!pool)
return;
+ ANV_RMV(resource_destroy, device, pool);
+
anv_device_release_bo(device, pool->bo);
vk_object_free(&device->vk, pAllocator, pool);
}
-#if GFX_VER >= 8
/**
* VK_KHR_performance_query layout :
*
* --------------------------------------------
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 0 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 0 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
* |-------------------------------|-- | Query 0
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 1 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 1 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
* |-------------------------------|-----------
* | availability (8b) | | |
* |-------------------------------| | |
- * | Small batch loading | | |
- * | ANV_PERF_QUERY_OFFSET_REG | | |
- * | (24b) | | Pass 0 |
- * |-------------------------------| | |
* | some padding (see | | |
- * | query_field_layout:alignment) | | |
+ * | query_field_layout:alignment) | | Pass 0 |
* |-------------------------------| | |
* | query data | | |
* | (2 * query_field_layout:size) | | |
@@ -333,7 +361,7 @@ khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
const struct anv_physical_device *pdevice = device->physical;
cmd_buffer->self_mod_locations =
- vk_alloc(&cmd_buffer->pool->alloc,
+ vk_alloc(&cmd_buffer->vk.pool->alloc,
pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -344,7 +372,6 @@ khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
return true;
}
-#endif
/**
* VK_INTEL_performance_query layout :
@@ -396,8 +423,7 @@ query_slot(struct anv_query_pool *pool, uint32_t query)
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
-#if GFX_VER >= 8
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
volatile uint64_t *slot =
pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
@@ -406,7 +432,6 @@ query_is_available(struct anv_query_pool *pool, uint32_t query)
}
return true;
}
-#endif
return *(volatile uint64_t *)query_slot(pool, query);
}
@@ -415,17 +440,29 @@ static VkResult
wait_for_available(struct anv_device *device,
struct anv_query_pool *pool, uint32_t query)
{
- uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC);
+ /* By default we leave a 2s timeout before declaring the device lost. */
+ uint64_t rel_timeout = 2 * NSEC_PER_SEC;
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+ /* With performance queries, there is an additional 500us reconfiguration
+ * time in i915.
+ */
+ rel_timeout += 500 * 1000;
+ /* Additionally a command buffer can be replayed N times to gather data
+ * for each of the metric sets to capture all the counters requested.
+ */
+ rel_timeout *= pool->n_passes;
+ }
+ uint64_t abs_timeout_ns = os_time_get_absolute_timeout(rel_timeout);
- while (anv_gettime_ns() < abs_timeout) {
+ while (os_time_get_nano() < abs_timeout_ns) {
if (query_is_available(pool, query))
return VK_SUCCESS;
- VkResult status = anv_device_query_status(device);
+ VkResult status = vk_device_check_status(&device->vk);
if (status != VK_SUCCESS)
return status;
}
- return anv_device_set_lost(device, "query timeout");
+ return vk_device_set_lost(&device->vk, "query timeout");
}
VkResult genX(GetQueryPoolResults)(
@@ -441,14 +478,23 @@ VkResult genX(GetQueryPoolResults)(
ANV_FROM_HANDLE(anv_device, device, _device);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
- assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
- pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
- pool->type == VK_QUERY_TYPE_TIMESTAMP ||
- pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
- pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
- pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
-
- if (anv_device_is_lost(device))
+ assert(
+#if GFX_VERx10 >= 125
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR ||
+#endif
+ pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP ||
+ pool->vk.query_type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
+ pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL ||
+ pool->vk.query_type == VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT ||
+ pool->vk.query_type == VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR);
+
+ if (vk_device_is_lost(&device->vk))
return VK_ERROR_DEVICE_LOST;
if (pData == NULL)
@@ -487,8 +533,9 @@ VkResult genX(GetQueryPoolResults)(
bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
uint32_t idx = 0;
- switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION: {
+ switch (pool->vk.query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results) {
/* From the Vulkan 1.2.132 spec:
@@ -507,22 +554,16 @@ VkResult genX(GetQueryPoolResults)(
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
uint64_t *slot = query_slot(pool, firstQuery + i);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
while (statistics) {
- uint32_t stat = u_bit_scan(&statistics);
+ UNUSED uint32_t stat = u_bit_scan(&statistics);
if (write_results) {
uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
-
- /* WaDividePSInvocationCountBy4:HSW,BDW */
- if ((device->info.ver == 8 || device->info.is_haswell) &&
- (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
- result >>= 2;
-
cpu_write_query_result(pData, flags, idx, result);
}
idx++;
}
- assert(idx == util_bitcount(pool->pipeline_statistics));
+ assert(idx == util_bitcount(pool->vk.pipeline_statistics));
break;
}
@@ -537,6 +578,26 @@ VkResult genX(GetQueryPoolResults)(
break;
}
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
+ if (write_results)
+ cpu_write_query_result(pData, flags, idx, slot[1]);
+ idx++;
+ break;
+ }
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
+ if (write_results)
+ cpu_write_query_result(pData, flags, idx, slot[2]);
+ idx++;
+ break;
+ }
+#endif
+
case VK_QUERY_TYPE_TIMESTAMP: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
@@ -545,7 +606,6 @@ VkResult genX(GetQueryPoolResults)(
break;
}
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
const struct anv_physical_device *pdevice = device->physical;
assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
@@ -554,7 +614,7 @@ VkResult genX(GetQueryPoolResults)(
const struct intel_perf_query_info *query = pool->pass_query[p];
struct intel_perf_query_result result;
intel_perf_query_result_clear(&result);
- intel_perf_query_result_accumulate_fields(&result, query, &device->info,
+ intel_perf_query_result_accumulate_fields(&result, query,
pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
false /* no_oa_accumulate */);
@@ -562,7 +622,6 @@ VkResult genX(GetQueryPoolResults)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
if (!write_results)
@@ -571,18 +630,26 @@ VkResult genX(GetQueryPoolResults)(
const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
struct intel_perf_query_result result;
intel_perf_query_result_clear(&result);
- intel_perf_query_result_accumulate_fields(&result, query, &device->info,
+ intel_perf_query_result_accumulate_fields(&result, query,
query_data + intel_perf_query_data_offset(pool, false),
query_data + intel_perf_query_data_offset(pool, true),
false /* no_oa_accumulate */);
intel_perf_query_result_write_mdapi(pData, stride,
- &device->info,
+ device->info,
query, &result);
const uint64_t *marker = query_data + intel_perf_marker_offset();
- intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
+ intel_perf_query_mdapi_write_marker(pData, stride, device->info, *marker);
break;
}
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ if (!write_results)
+ break;
+ const uint32_t *query_data = query_slot(pool, firstQuery + i);
+ uint32_t result = available ? *query_data : 0;
+ cpu_write_query_result(pData, flags, idx, result);
+ break;
+
default:
unreachable("invalid pool type");
}
@@ -608,15 +675,11 @@ emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WritePSDepthCount;
- pc.DepthStallEnable = true;
- pc.Address = addr;
-
- if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
- pc.CommandStreamerStallEnable = true;
- }
+ bool cs_stall_needed = (GFX_VER == 9 && cmd_buffer->device->info->gt == 4);
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WritePSDepthCount, addr, 0,
+ ANV_PIPE_DEPTH_STALL_BIT | (cs_stall_needed ? ANV_PIPE_CS_STALL_BIT : 0));
}
static void
@@ -635,12 +698,10 @@ emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WriteImmediateData;
- pc.Address = addr;
- pc.ImmediateData = available;
- }
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteImmediateData, addr,
+ available, 0);
}
/**
@@ -652,7 +713,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
struct mi_builder *b, struct anv_query_pool *pool,
uint32_t first_index, uint32_t num_queries)
{
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_TIMESTAMP:
/* These queries are written with a PIPE_CONTROL so clear them using the
@@ -673,6 +734,7 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
for (uint32_t i = 0; i < num_queries; i++) {
@@ -683,7 +745,6 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
for (uint32_t i = 0; i < num_queries; i++) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
@@ -696,7 +757,6 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
for (uint32_t i = 0; i < num_queries; i++) {
@@ -720,10 +780,44 @@ void genX(CmdResetQueryPool)(
{
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+ struct anv_physical_device *pdevice = cmd_buffer->device->physical;
+
+ /* Shader clearing is only possible on render/compute */
+ if (anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer) &&
+ queryCount >= pdevice->instance->query_clear_with_blorp_threshold) {
+ trace_intel_begin_query_clear_blorp(&cmd_buffer->trace);
+
+ anv_cmd_buffer_fill_area(cmd_buffer,
+ anv_query_address(pool, firstQuery),
+ queryCount * pool->stride,
+ 0);
+
+ /* The pending clear writes count as compute writes if we're in GPGPU
+ * mode on the render engine or running on the compute engine.
+ */
+ if (anv_cmd_buffer_is_compute_queue(cmd_buffer) ||
+ cmd_buffer->state.current_pipeline == pdevice->gpgpu_pipeline_value) {
+ cmd_buffer->state.queries.clear_bits =
+ ANV_QUERY_COMPUTE_WRITES_PENDING_BITS;
+ } else {
+ cmd_buffer->state.queries.clear_bits =
+ ANV_QUERY_RENDER_TARGET_WRITES_PENDING_BITS(&pdevice->info);
+ }
+
+ trace_intel_end_query_clear_blorp(&cmd_buffer->trace, queryCount);
+ return;
+ }
- switch (pool->type) {
+ trace_intel_begin_query_clear_cs(&cmd_buffer->trace);
+
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
- case VK_QUERY_TYPE_TIMESTAMP:
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+#endif
for (uint32_t i = 0; i < queryCount; i++) {
emit_query_pc_availability(cmd_buffer,
anv_query_address(pool, firstQuery + i),
@@ -731,20 +825,37 @@ void genX(CmdResetQueryPool)(
}
break;
+ case VK_QUERY_TYPE_TIMESTAMP: {
+ for (uint32_t i = 0; i < queryCount; i++) {
+ emit_query_pc_availability(cmd_buffer,
+ anv_query_address(pool, firstQuery + i),
+ false);
+ }
+
+ /* Add a CS stall here to make sure the PIPE_CONTROL above has
+ * completed. Otherwise some timestamps written later with MI_STORE_*
+ * commands might race with the PIPE_CONTROL in the loop above.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer, ANV_PIPE_CS_STALL_BIT,
+ "vkCmdResetQueryPool of timestamps");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ break;
+ }
+
case VK_QUERY_TYPE_PIPELINE_STATISTICS:
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++)
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
break;
}
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
@@ -756,20 +867,24 @@ void genX(CmdResetQueryPool)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++)
emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
break;
}
-
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ for (uint32_t i = 0; i < queryCount; i++)
+ emit_query_mi_flush_availability(cmd_buffer, anv_query_address(pool, firstQuery + i), false);
+ break;
default:
unreachable("Unsupported query type");
}
+
+ trace_intel_end_query_clear_cs(&cmd_buffer->trace, queryCount);
}
void genX(ResetQueryPool)(
@@ -781,14 +896,12 @@ void genX(ResetQueryPool)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
for (uint32_t i = 0; i < queryCount; i++) {
- if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
-#if GFX_VER >= 8
+ if (pool->vk.query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
for (uint32_t p = 0; p < pool->n_passes; p++) {
uint64_t *pass_slot = pool->bo->map +
khr_perf_query_availability_offset(pool, firstQuery + i, p);
*pass_slot = 0;
}
-#endif
} else {
uint64_t *slot = query_slot(pool, firstQuery + i);
*slot = 0;
@@ -858,6 +971,7 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: {
struct anv_address addr = anv_address_add(data_addr, field->location);
@@ -877,15 +991,22 @@ emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
}
}
-void genX(CmdBeginQuery)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t query,
- VkQueryControlFlags flags)
+static void
+emit_query_clear_flush(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ const char *reason)
{
- genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
+ if (cmd_buffer->state.queries.clear_bits == 0)
+ return;
+
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_QUERY_BITS(
+ cmd_buffer->state.queries.clear_bits),
+ reason);
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
+
void genX(CmdBeginQueryIndexedEXT)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -897,22 +1018,39 @@ void genX(CmdBeginQueryIndexedEXT)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
struct anv_address query_addr = anv_query_address(pool, query);
+ emit_query_clear_flush(cmd_buffer, pool, "CmdBeginQuery* flush query clears");
+
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr);
+ mi_builder_set_mocs(&b, mocs);
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
+ cmd_buffer->state.gfx.n_occlusion_queries++;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
break;
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
+ mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
+ mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
+ break;
+
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
/* TODO: This might only be necessary for certain stats */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
uint32_t offset = 8;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@@ -923,14 +1061,14 @@ void genX(CmdBeginQueryIndexedEXT)(
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
if (!khr_perf_query_ensure_relocs(cmd_buffer))
return;
@@ -979,12 +1117,15 @@ void genX(CmdBeginQueryIndexedEXT)(
assert(reloc_idx == pdevice->n_perf_query_commands);
- mi_self_mod_barrier(&b);
+ const struct intel_device_info *devinfo = cmd_buffer->device->info;
+ const enum intel_engine_class engine_class = cmd_buffer->queue_family->engine_class;
+ mi_self_mod_barrier(&b, devinfo->engine_class_prefetch[engine_class]);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
cmd_buffer->perf_query_pool = pool;
cmd_buffer->perf_reloc_idx = 0;
@@ -1007,6 +1148,7 @@ void genX(CmdBeginQueryIndexedEXT)(
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
dws =
@@ -1040,30 +1182,24 @@ void genX(CmdBeginQueryIndexedEXT)(
}
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
break;
}
-
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, false);
+ break;
default:
unreachable("");
}
}
-void genX(CmdEndQuery)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t query)
-{
- genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
-}
-
void genX(CmdEndQueryIndexedEXT)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -1075,22 +1211,40 @@ void genX(CmdEndQueryIndexedEXT)(
struct anv_address query_addr = anv_query_address(pool, query);
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
emit_query_pc_availability(cmd_buffer, query_addr, true);
+ cmd_buffer->state.gfx.n_occlusion_queries--;
+ cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_OCCLUSION_QUERY_ACTIVE;
+ break;
+
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ /* Ensure previous commands have completed before capturing the register
+ * value.
+ */
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
+
+ mi_store(&b, mi_mem64(anv_address_add(query_addr, 16)),
+ mi_reg64(GENX(CL_INVOCATION_COUNT_num)));
+ emit_query_mi_availability(&b, query_addr, true);
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
/* TODO: This might only be necessary for certain stats */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
uint32_t offset = 16;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@@ -1103,21 +1257,21 @@ void genX(CmdEndQueryIndexedEXT)(
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
-
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
emit_query_mi_availability(&b, query_addr, true);
break;
-#if GFX_VER >= 8
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
cmd_buffer->perf_query_pool = pool;
if (!khr_perf_query_ensure_relocs(cmd_buffer))
@@ -1144,6 +1298,7 @@ void genX(CmdEndQueryIndexedEXT)(
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
+ case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
dws =
@@ -1189,13 +1344,13 @@ void genX(CmdEndQueryIndexedEXT)(
assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
break;
}
-#endif
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.CommandStreamerStallEnable = true;
- pc.StallAtPixelScoreboard = true;
- }
+ genx_batch_emit_pipe_control(&cmd_buffer->batch,
+ cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline,
+ ANV_PIPE_CS_STALL_BIT |
+ ANV_PIPE_STALL_AT_SCOREBOARD_BIT);
uint32_t marker_offset = intel_perf_marker_offset();
mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
mi_imm(cmd_buffer->intel_perf_marker));
@@ -1203,6 +1358,9 @@ void genX(CmdEndQueryIndexedEXT)(
emit_query_mi_availability(&b, query_addr, true);
break;
}
+ case VK_QUERY_TYPE_RESULT_STATUS_ONLY_KHR:
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, true);
+ break;
default:
unreachable("");
@@ -1216,9 +1374,9 @@ void genX(CmdEndQueryIndexedEXT)(
* first index, mark the other query indices as being already available
* with result 0.
*/
- if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
+ if (cmd_buffer->state.gfx.view_mask) {
const uint32_t num_queries =
- util_bitcount(cmd_buffer->state.subpass->view_mask);
+ util_bitcount(cmd_buffer->state.gfx.view_mask);
if (num_queries > 1)
emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
}
@@ -1226,9 +1384,9 @@ void genX(CmdEndQueryIndexedEXT)(
#define TIMESTAMP 0x2358
-void genX(CmdWriteTimestamp)(
+void genX(CmdWriteTimestamp2)(
VkCommandBuffer commandBuffer,
- VkPipelineStageFlagBits pipelineStage,
+ VkPipelineStageFlags2 stage,
VkQueryPool queryPool,
uint32_t query)
{
@@ -1236,34 +1394,49 @@ void genX(CmdWriteTimestamp)(
ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
struct anv_address query_addr = anv_query_address(pool, query);
- assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
+ assert(pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP);
+
+ emit_query_clear_flush(cmd_buffer, pool,
+ "CmdWriteTimestamp flush query clears");
struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
- switch (pipelineStage) {
- case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
+ if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT) {
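+      /* Top-of-pipe timestamps can be captured immediately by storing the
+       * TIMESTAMP register with MI commands.
+       */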
mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
mi_reg64(TIMESTAMP));
- break;
-
- default:
+ emit_query_mi_availability(&b, query_addr, true);
+ } else {
/* Everything else is bottom-of-pipe */
cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DestinationAddressType = DAT_PPGTT;
- pc.PostSyncOperation = WriteTimestamp;
- pc.Address = anv_address_add(query_addr, 8);
+ bool cs_stall_needed =
+ (GFX_VER == 9 && cmd_buffer->device->info->gt == 4);
- if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
- pc.CommandStreamerStallEnable = true;
+ if (anv_cmd_buffer_is_blitter_queue(cmd_buffer) ||
+ anv_cmd_buffer_is_video_queue(cmd_buffer)) {
+ /* Wa_16018063123 - emit fast color dummy blit before MI_FLUSH_DW. */
+ if (intel_needs_workaround(cmd_buffer->device->info, 16018063123)) {
+ genX(batch_emit_fast_color_dummy_blit)(&cmd_buffer->batch,
+ cmd_buffer->device);
+ }
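+         /* PIPE_CONTROL is not available on the blitter/video engines, so
+          * write the timestamp with MI_FLUSH_DW instead.
+          */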
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_FLUSH_DW), dw) {
+ dw.Address = anv_address_add(query_addr, 8);
+ dw.PostSyncOperation = WriteTimestamp;
+ }
+ emit_query_mi_flush_availability(cmd_buffer, query_addr, true);
+ } else {
+ genx_batch_emit_pipe_control_write
+ (&cmd_buffer->batch, cmd_buffer->device->info,
+ cmd_buffer->state.current_pipeline, WriteTimestamp,
+ anv_address_add(query_addr, 8), 0,
+ cs_stall_needed ? ANV_PIPE_CS_STALL_BIT : 0);
+ emit_query_pc_availability(cmd_buffer, query_addr, true);
}
- break;
+
}
- emit_query_pc_availability(cmd_buffer, query_addr, true);
/* When multiview is active the spec requires that N consecutive query
* indices are used, where N is the number of active views in the subpass.
@@ -1273,16 +1446,14 @@ void genX(CmdWriteTimestamp)(
* first index, mark the other query indices as being already available
* with result 0.
*/
- if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
+ if (cmd_buffer->state.gfx.view_mask) {
const uint32_t num_queries =
- util_bitcount(cmd_buffer->state.subpass->view_mask);
+ util_bitcount(cmd_buffer->state.gfx.view_mask);
if (num_queries > 1)
emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
}
}
-#if GFX_VERx10 >= 75
-
#define MI_PREDICATE_SRC0 0x2400
#define MI_PREDICATE_SRC1 0x2408
#define MI_PREDICATE_RESULT 0x2418
@@ -1341,61 +1512,92 @@ compute_query_result(struct mi_builder *b, struct anv_address addr)
mi_mem64(anv_address_add(addr, 0)));
}
-void genX(CmdCopyQueryPoolResults)(
- VkCommandBuffer commandBuffer,
- VkQueryPool queryPool,
- uint32_t firstQuery,
- uint32_t queryCount,
- VkBuffer destBuffer,
- VkDeviceSize destOffset,
- VkDeviceSize destStride,
- VkQueryResultFlags flags)
+static void
+copy_query_results_with_cs(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ struct anv_address dest_addr,
+ uint64_t dest_stride,
+ uint32_t first_query,
+ uint32_t query_count,
+ VkQueryResultFlags flags)
{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
- ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
+ enum anv_pipe_bits needed_flushes = 0;
- struct mi_builder b;
- mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
- struct mi_value result;
+ trace_intel_begin_query_copy_cs(&cmd_buffer->trace);
/* If render target writes are ongoing, request a render target cache flush
* to ensure proper ordering of the commands from the 3d pipe and the
* command streamer.
*/
- if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
- anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
- "CopyQueryPoolResults");
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_RT_FLUSH)
+ needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_TILE_FLUSH)
+ needed_flushes |= ANV_PIPE_TILE_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_DATA_FLUSH) {
+ needed_flushes |= (ANV_PIPE_DATA_CACHE_FLUSH_BIT |
+ ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
}
- if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
- (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
- /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
- * because we're about to copy values from MI commands, we need to
- * stall the command streamer to make sure the PIPE_CONTROL values have
- * landed, otherwise we could see inconsistent values & availability.
- *
- * From the vulkan spec:
- *
- * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
- * previous uses of vkCmdResetQueryPool in the same queue, without
- * any additional synchronization."
- */
- pool->type == VK_QUERY_TYPE_OCCLUSION ||
- pool->type == VK_QUERY_TYPE_TIMESTAMP) {
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) &
+ ANV_QUERY_WRITES_CS_STALL)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
+ * because we're about to copy values from MI commands, we need to stall
+ * the command streamer to make sure the PIPE_CONTROL values have
+ * landed, otherwise we could see inconsistent values & availability.
+ *
+ * From the vulkan spec:
+ *
+ * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+ * previous uses of vkCmdResetQueryPool in the same queue, without any
+ * additional synchronization."
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ if (needed_flushes) {
anv_add_pending_pipe_bits(cmd_buffer,
- ANV_PIPE_CS_STALL_BIT,
+ needed_flushes,
"CopyQueryPoolResults");
genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}
- struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
- for (uint32_t i = 0; i < queryCount; i++) {
- struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+ struct mi_value result;
+
+ for (uint32_t i = 0; i < query_count; i++) {
+ struct anv_address query_addr = anv_query_address(pool, first_query + i);
+ const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &query_addr);
+
+ mi_builder_set_mocs(&b, mocs);
+
+ /* Wait for the availability write to land before we go read the data */
+ if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
+ sem.WaitMode = PollingMode;
+ sem.CompareOperation = COMPARE_SAD_EQUAL_SDD;
+ sem.SemaphoreDataDword = true;
+ sem.SemaphoreAddress = query_addr;
+ }
+ }
+
uint32_t idx = 0;
- switch (pool->type) {
+ switch (pool->vk.query_type) {
case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
result = compute_query_result(&b, anv_address_add(query_addr, 8));
/* Like in the case of vkGetQueryPoolResults, if the query is
* unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
@@ -1403,32 +1605,23 @@ void genX(CmdCopyQueryPoolResults)(
* VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
*/
gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
- 1 /* available */, flags, idx, result);
+ 1 /* available */, flags, idx, result);
if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
- 0 /* unavailable */, flags, idx, mi_imm(0));
+ 0 /* unavailable */, flags, idx, mi_imm(0));
}
idx++;
break;
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
- uint32_t statistics = pool->pipeline_statistics;
+ uint32_t statistics = pool->vk.pipeline_statistics;
while (statistics) {
- uint32_t stat = u_bit_scan(&statistics);
-
+ UNUSED uint32_t stat = u_bit_scan(&statistics);
result = compute_query_result(&b, anv_address_add(query_addr,
idx * 16 + 8));
-
- /* WaDividePSInvocationCountBy4:HSW,BDW */
- if ((cmd_buffer->device->info.ver == 8 ||
- cmd_buffer->device->info.is_haswell) &&
- (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
- result = mi_ushr32_imm(&b, result, 2);
- }
-
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
}
- assert(idx == util_bitcount(pool->pipeline_statistics));
+ assert(idx == util_bitcount(pool->vk.pipeline_statistics));
break;
}
@@ -1444,11 +1637,23 @@ void genX(CmdCopyQueryPoolResults)(
gpu_write_query_result(&b, dest_addr, flags, idx++, result);
break;
-#if GFX_VER >= 8
+#if GFX_VERx10 >= 125
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ result = mi_mem64(anv_address_add(query_addr, 8));
+ gpu_write_query_result(&b, dest_addr, flags, idx++, result);
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ result = mi_mem64(anv_address_add(query_addr, 16));
+ gpu_write_query_result(&b, dest_addr, flags, idx++, result);
+ break;
+#endif
+
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
unreachable("Copy KHR performance query results not implemented");
break;
-#endif
default:
unreachable("unhandled query type");
@@ -1459,11 +1664,182 @@ void genX(CmdCopyQueryPoolResults)(
mi_mem64(query_addr));
}
- dest_addr = anv_address_add(dest_addr, destStride);
+ dest_addr = anv_address_add(dest_addr, dest_stride);
}
+
+ trace_intel_end_query_copy_cs(&cmd_buffer->trace, query_count);
+}
+
+static void
+copy_query_results_with_shader(struct anv_cmd_buffer *cmd_buffer,
+ struct anv_query_pool *pool,
+ struct anv_address dest_addr,
+ uint64_t dest_stride,
+ uint32_t first_query,
+ uint32_t query_count,
+ VkQueryResultFlags flags)
+{
+ struct anv_device *device = cmd_buffer->device;
+ enum anv_pipe_bits needed_flushes = 0;
+
+ trace_intel_begin_query_copy_shader(&cmd_buffer->trace);
+
+   /* If this is the first command in the batch buffer, make sure we have a
+    * consistent pipeline mode.
+ */
+ if (cmd_buffer->state.current_pipeline == UINT32_MAX)
+ genX(flush_pipeline_select_3d)(cmd_buffer);
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_RT_FLUSH)
+ needed_flushes |= ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
+
+ if ((cmd_buffer->state.queries.buffer_write_bits |
+ cmd_buffer->state.queries.clear_bits) & ANV_QUERY_WRITES_DATA_FLUSH) {
+ needed_flushes |= (ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
+ ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT);
+ }
+
+ /* Flushes for the queries to complete */
+ if (flags & VK_QUERY_RESULT_WAIT_BIT) {
+      /* Some queries are written by shaders, so we need them to flush their
+       * higher level cache writes. The L3 cache should be shared across the GPU.
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ pool->vk.query_type == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR) {
+ needed_flushes |= ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;
+ }
+ /* And we need to stall for previous CS writes to land or the flushes to
+ * complete.
+ */
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+ }
+
+ /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
+ * because we're about to copy values from MI commands, we need to stall
+ * the command streamer to make sure the PIPE_CONTROL values have
+ * landed, otherwise we could see inconsistent values & availability.
+ *
+ * From the vulkan spec:
+ *
+ * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
+ * previous uses of vkCmdResetQueryPool in the same queue, without any
+ * additional synchronization."
+ */
+ if (pool->vk.query_type == VK_QUERY_TYPE_OCCLUSION ||
+ pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP)
+ needed_flushes |= ANV_PIPE_CS_STALL_BIT;
+
+ if (needed_flushes) {
+ anv_add_pending_pipe_bits(cmd_buffer,
+ needed_flushes | ANV_PIPE_END_OF_PIPE_SYNC_BIT,
+ "CopyQueryPoolResults");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+ }
+
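+   /* Copy with an internal kernel dispatched through the simple shader
+    * infrastructure, using a compute or fragment variant depending on the
+    * currently selected pipeline.
+    */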
+ struct anv_shader_bin *copy_kernel;
+ VkResult ret =
+ anv_device_get_internal_shader(
+ cmd_buffer->device,
+ cmd_buffer->state.current_pipeline == GPGPU ?
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE :
+ ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT,
+ &copy_kernel);
+ if (ret != VK_SUCCESS) {
+ anv_batch_set_error(&cmd_buffer->batch, ret);
+ return;
+ }
+
+ struct anv_simple_shader state = {
+ .device = cmd_buffer->device,
+ .cmd_buffer = cmd_buffer,
+ .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
+ .general_state_stream = &cmd_buffer->general_state_stream,
+ .batch = &cmd_buffer->batch,
+ .kernel = copy_kernel,
+ .l3_config = device->internal_kernels_l3_config,
+ .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
+ };
+ genX(emit_simple_shader_init)(&state);
+
+ struct anv_state push_data_state =
+ genX(simple_shader_alloc_push)(&state,
+ sizeof(struct anv_query_copy_params));
+ if (push_data_state.map == NULL)
+ return;
+
+ struct anv_query_copy_params *params = push_data_state.map;
+
+ uint32_t copy_flags =
+ ((flags & VK_QUERY_RESULT_64_BIT) ? ANV_COPY_QUERY_FLAG_RESULT64 : 0) |
+ ((flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) ? ANV_COPY_QUERY_FLAG_AVAILABLE : 0);
+
+ uint32_t num_items = 1;
+ uint32_t data_offset = 8 /* behind availability */;
+ switch (pool->vk.query_type) {
+ case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+      /* These 2 queries are the only ones where we could have partial data
+       * because they are captured with a PIPE_CONTROL post-sync operation. The
+       * other ones are captured with MI_STORE_REGISTER_DATA, so the results are
+       * always available by the time we reach the copy command.
+ */
+ copy_flags |= (flags & VK_QUERY_RESULT_PARTIAL_BIT) ? ANV_COPY_QUERY_FLAG_PARTIAL : 0;
+ break;
+
+ case VK_QUERY_TYPE_PIPELINE_STATISTICS:
+ num_items = util_bitcount(pool->vk.pipeline_statistics);
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+ break;
+
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ num_items = 2;
+ copy_flags |= ANV_COPY_QUERY_FLAG_DELTA;
+ break;
+
+ case VK_QUERY_TYPE_TIMESTAMP:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ data_offset += 8;
+ break;
+
+ default:
+ unreachable("unhandled query type");
+ }
+
+ *params = (struct anv_query_copy_params) {
+ .flags = copy_flags,
+ .num_queries = query_count,
+ .num_items = num_items,
+ .query_base = first_query,
+ .query_stride = pool->stride,
+ .query_data_offset = data_offset,
+ .destination_stride = dest_stride,
+ .query_data_addr = anv_address_physical(
+ (struct anv_address) {
+ .bo = pool->bo,
+ }),
+ .destination_addr = anv_address_physical(dest_addr),
+ };
+
+ genX(emit_simple_shader_dispatch)(&state, query_count, push_data_state);
+
+   /* The query copy shader writes through the dataport, so flush the
+    * HDC/data cache depending on the generation. Also stall at the pixel
+    * scoreboard in case the copy was done with a fragment shader.
+ */
+ cmd_buffer->state.queries.buffer_write_bits |= ANV_QUERY_WRITES_DATA_FLUSH;
+
+ trace_intel_end_query_copy_shader(&cmd_buffer->trace, query_count);
}
-#else
void genX(CmdCopyQueryPoolResults)(
VkCommandBuffer commandBuffer,
VkQueryPool queryPool,
@@ -1474,6 +1850,99 @@ void genX(CmdCopyQueryPoolResults)(
VkDeviceSize destStride,
VkQueryResultFlags flags)
{
- anv_finishme("Queries not yet supported on Ivy Bridge");
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+ ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
+ struct anv_device *device = cmd_buffer->device;
+ struct anv_physical_device *pdevice = device->physical;
+
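+   /* Past the instance's query_copy_with_shader_threshold, copying with a
+    * shader dispatch is preferred over MI commands on the command streamer.
+    */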
+ if (queryCount > pdevice->instance->query_copy_with_shader_threshold) {
+ copy_query_results_with_shader(cmd_buffer, pool,
+ anv_address_add(buffer->address,
+ destOffset),
+ destStride,
+ firstQuery,
+ queryCount,
+ flags);
+ } else {
+ copy_query_results_with_cs(cmd_buffer, pool,
+ anv_address_add(buffer->address,
+ destOffset),
+ destStride,
+ firstQuery,
+ queryCount,
+ flags);
+ }
+}
+
+#if GFX_VERx10 == 125 && ANV_SUPPORT_RT
+
+#include "grl/include/GRLRTASCommon.h"
+#include "grl/grl_metakernel_postbuild_info.h"
+
+void
+genX(CmdWriteAccelerationStructuresPropertiesKHR)(
+ VkCommandBuffer commandBuffer,
+ uint32_t accelerationStructureCount,
+ const VkAccelerationStructureKHR* pAccelerationStructures,
+ VkQueryType queryType,
+ VkQueryPool queryPool,
+ uint32_t firstQuery)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
+
+ assert(queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR ||
+ queryType == VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR);
+
+ emit_query_clear_flush(cmd_buffer, pool,
+ "CmdWriteAccelerationStructuresPropertiesKHR flush query clears");
+
+ struct mi_builder b;
+ mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
+
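+   /* Use the GRL postbuild-info metakernels to write the requested property
+    * of each acceleration structure into its query slot.
+    */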
+ for (uint32_t i = 0; i < accelerationStructureCount; i++) {
+ ANV_FROM_HANDLE(vk_acceleration_structure, accel, pAccelerationStructures[i]);
+ struct anv_address query_addr =
+ anv_address_add(anv_query_address(pool, firstQuery + i), 8);
+
+ switch (queryType) {
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR:
+ genX(grl_postbuild_info_compacted_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SIZE_KHR:
+ genX(grl_postbuild_info_current_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_SIZE_KHR:
+ case VK_QUERY_TYPE_ACCELERATION_STRUCTURE_SERIALIZATION_BOTTOM_LEVEL_POINTERS_KHR:
+ genX(grl_postbuild_info_serialized_size)(cmd_buffer,
+ vk_acceleration_structure_get_va(accel),
+ anv_address_physical(query_addr));
+ break;
+
+ default:
+ unreachable("unhandled query type");
+ }
+ }
+
+ /* TODO: Figure out why MTL needs ANV_PIPE_DATA_CACHE_FLUSH_BIT in order
+ * to not lose the availability bit.
+ */
+ anv_add_pending_pipe_bits(cmd_buffer,
+ ANV_PIPE_END_OF_PIPE_SYNC_BIT |
+ ANV_PIPE_DATA_CACHE_FLUSH_BIT,
+ "after write acceleration struct props");
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ for (uint32_t i = 0; i < accelerationStructureCount; i++)
+ emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), true);
}
#endif
diff --git a/src/intel/vulkan/genX_simple_shader.c b/src/intel/vulkan/genX_simple_shader.c
new file mode 100644
index 00000000000..bfe1ba2b5bf
--- /dev/null
+++ b/src/intel/vulkan/genX_simple_shader.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "util/macros.h"
+
+#include "anv_private.h"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "common/intel_genX_state_brw.h"
+
+static void
+genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state)
+{
+ assert(state->cmd_buffer == NULL ||
+ state->cmd_buffer->state.current_pipeline == _3D);
+
+ struct anv_batch *batch = state->batch;
+ struct anv_device *device = state->device;
+ const struct brw_wm_prog_data *prog_data =
+ brw_wm_prog_data_const(state->kernel->prog_data);
+
+ uint32_t *dw = anv_batch_emitn(batch,
+ 1 + 2 * GENX(VERTEX_ELEMENT_STATE_length),
+ GENX(3DSTATE_VERTEX_ELEMENTS));
+   /* You might think there is some shady stuff going on here and you would
+    * be right. We're setting up 2 VERTEX_ELEMENT_STATE yet we only provide
+    * 1 (positions) VERTEX_BUFFER_STATE later.
+    *
+    * See blorp_emit_vertex_elements() in blorp_genX_exec_brw.h for more
+    * details on how to set up a 3D pipeline with a fragment shader but
+    * without a vertex shader.
+ */
+ GENX(VERTEX_ELEMENT_STATE_pack)(
+ batch, dw + 1, &(struct GENX(VERTEX_ELEMENT_STATE)) {
+ .VertexBufferIndex = 1,
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
+ .SourceElementOffset = 0,
+ .Component0Control = VFCOMP_STORE_SRC,
+ .Component1Control = VFCOMP_STORE_0,
+ .Component2Control = VFCOMP_STORE_0,
+ .Component3Control = VFCOMP_STORE_0,
+ });
+ GENX(VERTEX_ELEMENT_STATE_pack)(
+ batch, dw + 3, &(struct GENX(VERTEX_ELEMENT_STATE)) {
+ .VertexBufferIndex = 0,
+ .Valid = true,
+ .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
+ .SourceElementOffset = 0,
+ .Component0Control = VFCOMP_STORE_SRC,
+ .Component1Control = VFCOMP_STORE_SRC,
+ .Component2Control = VFCOMP_STORE_SRC,
+ .Component3Control = VFCOMP_STORE_1_FP,
+ });
+
+ anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf);
+ anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
+ sgvs.InstanceIDEnable = true;
+ sgvs.InstanceIDComponentNumber = COMP_1;
+ sgvs.InstanceIDElementOffset = 0;
+ }
+#if GFX_VER >= 11
+ anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
+#endif
+ anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.InstancingEnable = false;
+ vfi.VertexElementIndex = 0;
+ }
+ anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
+ vfi.InstancingEnable = false;
+ vfi.VertexElementIndex = 1;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
+ topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+ }
+
+ /* Emit URB setup. We tell it that the VS is active because we want it to
+ * allocate space for the VS. Even though one isn't run, we need VUEs to
+ * store the data that VF is going to pass to SOL.
+ */
+ struct intel_urb_config urb_cfg_out = {
+ .size = { DIV_ROUND_UP(32, 64), 1, 1, 1 },
+ };
+
+ genX(emit_l3_config)(batch, device, state->l3_config);
+
+ state->cmd_buffer->state.current_l3_config = state->l3_config;
+
+ enum intel_urb_deref_block_size deref_block_size;
+ genX(emit_urb_setup)(device, batch, state->l3_config,
+ VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT,
+ state->urb_cfg, &urb_cfg_out, &deref_block_size);
+
+ anv_batch_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
+ ps_blend.HasWriteableRT = true;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm);
+
+#if GFX_VER >= 12
+ anv_batch_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
+ db.DepthBoundsTestEnable = false;
+ db.DepthBoundsTestMinValue = 0.0;
+ db.DepthBoundsTestMaxValue = 1.0;
+ }
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms);
+ anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
+ sm.SampleMask = 0x1;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_VS), vs);
+ anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
+ anv_batch_emit(batch, GENX(3DSTATE_TE), te);
+ anv_batch_emit(batch, GENX(3DSTATE_DS), DS);
+
+#if GFX_VERx10 >= 125
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh);
+ anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task);
+ }
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so);
+
+ anv_batch_emit(batch, GENX(3DSTATE_GS), gs);
+
+ anv_batch_emit(batch, GENX(3DSTATE_CLIP), clip) {
+ clip.PerspectiveDivideDisable = true;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SF), sf) {
+#if GFX_VER >= 12
+ sf.DerefBlockSize = deref_block_size;
+#endif
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_RASTER), raster) {
+ raster.CullMode = CULLMODE_NONE;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) {
+ sbe.VertexURBEntryReadOffset = 1;
+ sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
+ sbe.VertexURBEntryReadLength = MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
+ sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
+ sbe.ForceVertexURBEntryReadLength = true;
+ sbe.ForceVertexURBEntryReadOffset = true;
+ for (unsigned i = 0; i < 32; i++)
+ sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_WM), wm);
+
+ anv_batch_emit(batch, GENX(3DSTATE_PS), ps) {
+ intel_set_ps_dispatch_state(&ps, device->info, prog_data,
+ 1 /* rasterization_samples */,
+ 0 /* msaa_flags */);
+
+ ps.VectorMaskEnable = prog_data->uses_vmask;
+
+ ps.BindingTableEntryCount = GFX_VER == 9 ? 1 : 0;
+#if GFX_VER < 20
+ ps.PushConstantEnable = prog_data->base.nr_params > 0 ||
+ prog_data->base.ubo_ranges[0].length;
+#endif
+
+ ps.DispatchGRFStartRegisterForConstantSetupData0 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+ ps.DispatchGRFStartRegisterForConstantSetupData1 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+#if GFX_VER < 20
+ ps.DispatchGRFStartRegisterForConstantSetupData2 =
+ brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+#endif
+
+ ps.KernelStartPointer0 = state->kernel->kernel.offset +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+ ps.KernelStartPointer1 = state->kernel->kernel.offset +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+#if GFX_VER < 20
+ ps.KernelStartPointer2 = state->kernel->kernel.offset +
+ brw_wm_prog_data_prog_offset(prog_data, ps, 2);
+#endif
+
+ ps.MaximumNumberofThreadsPerPSD = device->info->max_threads_per_psd - 1;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
+ psx.PixelShaderValid = true;
+#if GFX_VER < 20
+ psx.AttributeEnable = prog_data->num_varying_inputs > 0;
+#endif
+ psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
+ psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
+ psx.PixelShaderComputesStencil = prog_data->computed_stencil;
+ }
+
+ anv_batch_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
+ struct anv_state cc_state =
+ anv_state_stream_alloc(state->dynamic_state_stream,
+ 4 * GENX(CC_VIEWPORT_length), 32);
+ if (cc_state.map == NULL)
+ return;
+
+ struct GENX(CC_VIEWPORT) cc_viewport = {
+ .MinimumDepth = 0.0f,
+ .MaximumDepth = 1.0f,
+ };
+ GENX(CC_VIEWPORT_pack)(NULL, cc_state.map, &cc_viewport);
+ cc.CCViewportPointer = cc_state.offset;
+ }
+
+#if GFX_VER >= 12
+ /* Disable Primitive Replication. */
+ anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
+#endif
+
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc);
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_HS), alloc);
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_DS), alloc);
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_GS), alloc);
+ anv_batch_emit(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
+ alloc.ConstantBufferOffset = 0;
+ alloc.ConstantBufferSize = device->info->max_constant_urb_size_kb;
+ }
+
+#if GFX_VERx10 == 125
+ /* DG2: Wa_22011440098
+ * MTL: Wa_18022330953
+ *
+    * In 3D mode, after programming the push constant alloc command,
+    * immediately program a push constant command (ZERO length) without any
+    * commit between them.
+ *
+ * Note that Wa_16011448509 isn't needed here as all address bits are zero.
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_ALL), c) {
+ /* Update empty push constants for all stages (bitmask = 11111b) */
+ c.ShaderUpdateEnable = 0x1f;
+ c.MOCS = anv_mocs(device, NULL, 0);
+ }
+#endif
+
+#if GFX_VER == 9
+   /* Allocate a binding table for Gfx9 for 2 reasons:
+    *
+    *   1. we need to emit a 3DSTATE_BINDING_TABLE_POINTERS_PS to make the
+    *      HW apply the preceding 3DSTATE_CONSTANT_PS
+ *
+ * 2. Emitting an empty 3DSTATE_BINDING_TABLE_POINTERS_PS would cause RT
+ * writes (even though they're empty) to disturb later writes
+ * (probably due to RT cache)
+ *
+ * Our binding table only has one entry to the null surface.
+ */
+ uint32_t bt_offset;
+ state->bt_state =
+ anv_cmd_buffer_alloc_binding_table(state->cmd_buffer, 1, &bt_offset);
+ if (state->bt_state.map == NULL) {
+ VkResult result = anv_cmd_buffer_new_binding_table_block(state->cmd_buffer);
+ if (result != VK_SUCCESS)
+ return;
+
+ /* Re-emit state base addresses so we get the new surface state base
+ * address before we start emitting binding tables etc.
+ */
+ genX(cmd_buffer_emit_bt_pool_base_address)(state->cmd_buffer);
+
+ state->bt_state =
+ anv_cmd_buffer_alloc_binding_table(state->cmd_buffer, 1, &bt_offset);
+ assert(state->bt_state.map != NULL);
+ }
+
+ uint32_t *bt_map = state->bt_state.map;
+ bt_map[0] = anv_bindless_state_for_binding_table(
+ device,
+ device->null_surface_state).offset + bt_offset;
+
+ state->cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+#endif
+
+   /* Flag all the packets emitted by the simple shader as dirty so the
+    * regular emission paths reprogram them on the next draw.
+    */
+ struct anv_gfx_dynamic_state *hw_state =
+ &state->cmd_buffer->state.gfx.dyn_state;
+
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
+#if GFX_VER >= 11
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
+#endif
+#if GFX_VER >= 12
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
+#endif
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VIEWPORT_CC);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_CLIP);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_RASTER);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DEPTH_BOUNDS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_WM_DEPTH_STENCIL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_EXTRA);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS_BLEND);
+ if (device->vk.enabled_extensions.EXT_mesh_shader) {
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
+ BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
+ }
+
+ /* Update urb config after simple shader. */
+ memcpy(&state->cmd_buffer->state.gfx.urb_cfg, &urb_cfg_out,
+ sizeof(struct intel_urb_config));
+
+ state->cmd_buffer->state.gfx.vb_dirty = BITFIELD_BIT(0);
+ state->cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_INDEX_BUFFER |
+ ANV_CMD_DIRTY_XFB_ENABLE);
+ state->cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
+ state->cmd_buffer->state.gfx.push_constant_stages = VK_SHADER_STAGE_FRAGMENT_BIT;
+}
+
+static void
+genX(emit_simpler_shader_init_compute)(struct anv_simple_shader *state)
+{
+ assert(state->cmd_buffer == NULL ||
+ state->cmd_buffer->state.current_pipeline == GPGPU);
+
+#if GFX_VERx10 >= 125
+ struct anv_shader_bin *cs_bin = state->kernel;
+ const struct brw_cs_prog_data *prog_data =
+ (const struct brw_cs_prog_data *) cs_bin->prog_data;
+ /* Currently our simple shaders are simple enough that they never spill. */
+ assert(prog_data->base.total_scratch == 0);
+ if (state->cmd_buffer != NULL) {
+ genX(cmd_buffer_ensure_cfe_state)(state->cmd_buffer, 0);
+ } else {
+ anv_batch_emit(state->batch, GENX(CFE_STATE), cfe) {
+ cfe.MaximumNumberofThreads =
+ state->device->info->max_cs_threads *
+ state->device->info->subslice_total;
+ }
+ }
+#endif
+}
+
+/** Initialize a simple shader emission */
+void
+genX(emit_simple_shader_init)(struct anv_simple_shader *state)
+{
+ assert(state->kernel->stage == MESA_SHADER_FRAGMENT ||
+ state->kernel->stage == MESA_SHADER_COMPUTE);
+
+ if (state->kernel->stage == MESA_SHADER_FRAGMENT)
+ genX(emit_simpler_shader_init_fragment)(state);
+ else
+ genX(emit_simpler_shader_init_compute)(state);
+}
+
+/** Allocate push constant data for a simple shader */
+struct anv_state
+genX(simple_shader_alloc_push)(struct anv_simple_shader *state, uint32_t size)
+{
+ struct anv_state s;
+
+ if (state->kernel->stage == MESA_SHADER_FRAGMENT) {
+ s = anv_state_stream_alloc(state->dynamic_state_stream,
+ size, ANV_UBO_ALIGNMENT);
+ } else {
+#if GFX_VERx10 >= 125
+ s = anv_state_stream_alloc(state->general_state_stream, align(size, 64), 64);
+#else
+ s = anv_state_stream_alloc(state->dynamic_state_stream, size, 64);
+#endif
+ }
+
+ if (s.map == NULL)
+ anv_batch_set_error(state->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+ return s;
+}
+
+/** Get the address of allocated push constant data by
+ * genX(simple_shader_alloc_push)
+ */
+struct anv_address
+genX(simple_shader_push_state_address)(struct anv_simple_shader *state,
+ struct anv_state push_state)
+{
+ if (state->kernel->stage == MESA_SHADER_FRAGMENT) {
+ return anv_state_pool_state_address(
+ &state->device->dynamic_state_pool, push_state);
+ } else {
+#if GFX_VERx10 >= 125
+ return anv_state_pool_state_address(
+ &state->device->general_state_pool, push_state);
+#else
+ return anv_state_pool_state_address(
+ &state->device->dynamic_state_pool, push_state);
+#endif
+ }
+}
+
+/** Emit a simple shader dispatch */
+void
+genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
+ uint32_t num_threads,
+ struct anv_state push_state)
+{
+ struct anv_device *device = state->device;
+ struct anv_batch *batch = state->batch;
+ struct anv_address push_addr =
+ anv_state_pool_state_address(&device->dynamic_state_pool, push_state);
+
+ if (state->kernel->stage == MESA_SHADER_FRAGMENT) {
+ /* At the moment we require a command buffer associated with this
+ * emission as we need to allocate binding tables on Gfx9.
+ */
+ assert(state->cmd_buffer != NULL);
+
+ struct anv_state vs_data_state =
+ anv_state_stream_alloc(state->dynamic_state_stream,
+ 9 * sizeof(uint32_t), 32);
+ if (vs_data_state.map == NULL)
+ return;
+
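+      /* Emit a RECTLIST covering at least one pixel per work item, limited
+       * to 8192 pixels per row.
+       */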
+ float x0 = 0.0f, x1 = MIN2(num_threads, 8192);
+ float y0 = 0.0f, y1 = DIV_ROUND_UP(num_threads, 8192);
+ float z = 0.0f;
+
+ float *vertices = vs_data_state.map;
+ vertices[0] = x1; vertices[1] = y1; vertices[2] = z; /* v0 */
+ vertices[3] = x0; vertices[4] = y1; vertices[5] = z; /* v1 */
+ vertices[6] = x0; vertices[7] = y0; vertices[8] = z; /* v2 */
+
+ uint32_t *dw = anv_batch_emitn(batch,
+ 1 + GENX(VERTEX_BUFFER_STATE_length),
+ GENX(3DSTATE_VERTEX_BUFFERS));
+ GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1,
+ &(struct GENX(VERTEX_BUFFER_STATE)) {
+ .VertexBufferIndex = 0,
+ .AddressModifyEnable = true,
+ .BufferStartingAddress = (struct anv_address) {
+ .bo = device->dynamic_state_pool.block_pool.bo,
+ .offset = vs_data_state.offset,
+ },
+ .BufferPitch = 3 * sizeof(float),
+ .BufferSize = 9 * sizeof(float),
+ .MOCS = anv_mocs(device, NULL, 0),
+#if GFX_VER >= 12
+ .L3BypassDisable = true,
+#endif
+ });
+
+#if GFX_VERx10 > 120
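+      /* On Gfx12.5+ the push constant buffer is bound with a single
+       * 3DSTATE_CONSTANT_ALL packet; older platforms use 3DSTATE_CONSTANT_PS
+       * below.
+       */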
+ dw =
+ anv_batch_emitn(batch,
+ GENX(3DSTATE_CONSTANT_ALL_length) +
+ GENX(3DSTATE_CONSTANT_ALL_DATA_length),
+ GENX(3DSTATE_CONSTANT_ALL),
+ .ShaderUpdateEnable = BITFIELD_BIT(MESA_SHADER_FRAGMENT),
+ .PointerBufferMask = 0x1,
+ .MOCS = anv_mocs(device, NULL, 0));
+
+ GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
+ batch, dw + GENX(3DSTATE_CONSTANT_ALL_length),
+ &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
+ .PointerToConstantBuffer = push_addr,
+ .ConstantBufferReadLength = DIV_ROUND_UP(push_state.alloc_size, 32),
+ });
+#else
+ /* The Skylake PRM contains the following restriction:
+ *
+ * "The driver must ensure The following case does not occur
+ * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+ * buffer 3 read length equal to zero committed followed by a
+ * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+ * zero committed."
+ *
+ * To avoid this, we program the highest slot.
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_CONSTANT_PS), c) {
+ c.MOCS = anv_mocs(device, NULL, 0);
+ c.ConstantBody.ReadLength[3] = DIV_ROUND_UP(push_state.alloc_size, 32);
+ c.ConstantBody.Buffer[3] = push_addr;
+ }
+#endif
+
+#if GFX_VER == 9
+ /* Why are the push constants not flushed without a binding table
+ * update??
+ */
+ anv_batch_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), btp) {
+ btp.PointertoPSBindingTable = state->bt_state.offset;
+ }
+#endif
+
+ genX(emit_breakpoint)(batch, device, true);
+ anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
+ prim.VertexAccessType = SEQUENTIAL;
+ prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
+ prim.VertexCountPerInstance = 3;
+ prim.InstanceCount = 1;
+ }
+ genX(batch_emit_post_3dprimitive_was)(batch, device, _3DPRIM_RECTLIST, 3);
+ genX(emit_breakpoint)(batch, device, false);
+ } else {
+ const struct intel_device_info *devinfo = device->info;
+ const struct brw_cs_prog_data *prog_data =
+ (const struct brw_cs_prog_data *) state->kernel->prog_data;
+ const struct intel_cs_dispatch_info dispatch =
+ brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+
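+      /* Dispatch enough thread groups to cover num_threads invocations:
+       * COMPUTE_WALKER on Gfx12.5+, MEDIA_VFE_STATE + GPGPU_WALKER otherwise.
+       */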
+#if GFX_VERx10 >= 125
+ anv_batch_emit(batch, GENX(COMPUTE_WALKER), cw) {
+ cw.SIMDSize = dispatch.simd_size / 16;
+      cw.MessageSIMD = dispatch.simd_size / 16;
+ cw.IndirectDataStartAddress = push_state.offset;
+ cw.IndirectDataLength = push_state.alloc_size;
+ cw.LocalXMaximum = prog_data->local_size[0] - 1;
+ cw.LocalYMaximum = prog_data->local_size[1] - 1;
+ cw.LocalZMaximum = prog_data->local_size[2] - 1;
+ cw.ThreadGroupIDXDimension = DIV_ROUND_UP(num_threads,
+ dispatch.simd_size);
+ cw.ThreadGroupIDYDimension = 1;
+ cw.ThreadGroupIDZDimension = 1;
+ cw.ExecutionMask = dispatch.right_mask;
+ cw.PostSync.MOCS = anv_mocs(device, NULL, 0);
+
+#if GFX_VERx10 >= 125
+ cw.GenerateLocalID = prog_data->generate_local_id != 0;
+ cw.EmitLocal = prog_data->generate_local_id;
+ cw.WalkOrder = prog_data->walk_order;
+ cw.TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
+ TileY32bpe : Linear;
+#endif
+
+ cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+ .KernelStartPointer = state->kernel->kernel.offset +
+ brw_cs_prog_data_prog_offset(prog_data,
+ dispatch.simd_size),
+ .SamplerStatePointer = 0,
+ .BindingTablePointer = 0,
+ .BindingTableEntryCount = 0,
+ .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+ .SharedLocalMemorySize = encode_slm_size(GFX_VER,
+ prog_data->base.total_shared),
+ .NumberOfBarriers = prog_data->uses_barrier,
+ };
+ }
+#else
+ const uint32_t vfe_curbe_allocation =
+ ALIGN(prog_data->push.per_thread.regs * dispatch.threads +
+ prog_data->push.cross_thread.regs, 2);
+
+ /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
+ *
+ * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+ * the only bits that are changed are scoreboard related: Scoreboard
+       *     Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
+ * these scoreboard related states, a MEDIA_STATE_FLUSH is
+ * sufficient."
+ */
+ enum anv_pipe_bits emitted_bits = 0;
+ genX(emit_apply_pipe_flushes)(batch, device, GPGPU, ANV_PIPE_CS_STALL_BIT,
+ &emitted_bits);
+
+ /* If we have a command buffer allocated with the emission, update the
+ * pending bits.
+ */
+ if (state->cmd_buffer)
+ anv_cmd_buffer_update_pending_query_bits(state->cmd_buffer, emitted_bits);
+
+ anv_batch_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
+ vfe.StackSize = 0;
+ vfe.MaximumNumberofThreads =
+ devinfo->max_cs_threads * devinfo->subslice_total - 1;
+ vfe.NumberofURBEntries = 2;
+#if GFX_VER < 11
+ vfe.ResetGatewayTimer = true;
+#endif
+ vfe.URBEntryAllocationSize = 2;
+ vfe.CURBEAllocationSize = vfe_curbe_allocation;
+
+ if (prog_data->base.total_scratch) {
+ /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
+ * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
+ */
+ vfe.PerThreadScratchSpace =
+ ffs(prog_data->base.total_scratch) - 11;
+ vfe.ScratchSpaceBasePointer =
+ (struct anv_address) {
+ .bo = anv_scratch_pool_alloc(device,
+ &device->scratch_pool,
+ MESA_SHADER_COMPUTE,
+ prog_data->base.total_scratch),
+ .offset = 0,
+ };
+ }
+ }
+ struct anv_state iface_desc_state =
+ anv_state_stream_alloc(state->dynamic_state_stream,
+ GENX(INTERFACE_DESCRIPTOR_DATA_length) * 4, 64);
+ if (iface_desc_state.map == NULL)
+ return;
+
+ struct GENX(INTERFACE_DESCRIPTOR_DATA) iface_desc = {
+ .KernelStartPointer = state->kernel->kernel.offset +
+ brw_cs_prog_data_prog_offset(prog_data,
+ dispatch.simd_size),
+
+ .SamplerCount = 0,
+ .BindingTableEntryCount = 0,
+ .BarrierEnable = prog_data->uses_barrier,
+ .SharedLocalMemorySize = encode_slm_size(GFX_VER,
+ prog_data->base.total_shared),
+
+ .ConstantURBEntryReadOffset = 0,
+ .ConstantURBEntryReadLength = prog_data->push.per_thread.regs,
+ .CrossThreadConstantDataReadLength = prog_data->push.cross_thread.regs,
+#if GFX_VER >= 12
+ /* TODO: Check if we are missing workarounds and enable mid-thread
+ * preemption.
+ *
+ * We still have issues with mid-thread preemption (it was already
+ * disabled by the kernel on gfx11, due to missing workarounds). It's
+ * possible that we are just missing some workarounds, and could
+    * enable it later, but for now let's disable it to fix a GPU hang in
+    * compute in Car Chase (and possibly more).
+ */
+ .ThreadPreemptionDisable = true,
+#endif
+ .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+ };
+ GENX(INTERFACE_DESCRIPTOR_DATA_pack)(batch, iface_desc_state.map, &iface_desc);
+ anv_batch_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
+ mid.InterfaceDescriptorTotalLength = iface_desc_state.alloc_size;
+ mid.InterfaceDescriptorDataStartAddress = iface_desc_state.offset;
+ }
+ anv_batch_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+ curbe.CURBEDataStartAddress = push_state.offset;
+ curbe.CURBETotalDataLength = push_state.alloc_size;
+ }
+ anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
+ ggw.SIMDSize = dispatch.simd_size / 16;
+ ggw.ThreadDepthCounterMaximum = 0;
+ ggw.ThreadHeightCounterMaximum = 0;
+ ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
+ ggw.ThreadGroupIDXDimension = DIV_ROUND_UP(num_threads,
+ dispatch.simd_size);
+ ggw.ThreadGroupIDYDimension = 1;
+ ggw.ThreadGroupIDZDimension = 1;
+ ggw.RightExecutionMask = dispatch.right_mask;
+ ggw.BottomExecutionMask = 0xffffffff;
+ }
+#endif
+ }
+}
+
+void
+genX(emit_simple_shader_end)(struct anv_simple_shader *state)
+{
+ anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end);
+
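+   /* Pad the batch to a QWord (8 byte) boundary with a MI_NOOP if needed. */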
+ if ((state->batch->next - state->batch->start) & 4)
+ anv_batch_emit(state->batch, GENX(MI_NOOP), noop);
+}
diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c
deleted file mode 100644
index dd8ada9087a..00000000000
--- a/src/intel/vulkan/genX_state.c
+++ /dev/null
@@ -1,894 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "anv_private.h"
-
-#include "common/intel_aux_map.h"
-#include "common/intel_sample_positions.h"
-#include "genxml/gen_macros.h"
-#include "genxml/genX_pack.h"
-
-#include "vk_util.h"
-
-/**
- * Compute an \p n x \p m pixel hashing table usable as slice, subslice or
- * pixel pipe hashing table. The resulting table is the cyclic repetition of
- * a fixed pattern with periodicity equal to \p period.
- *
- * If \p index is specified to be equal to \p period, a 2-way hashing table
- * will be generated such that indices 0 and 1 are returned for the following
- * fractions of entries respectively:
- *
- * p_0 = ceil(period / 2) / period
- * p_1 = floor(period / 2) / period
- *
- * If \p index is even and less than \p period, a 3-way hashing table will be
- * generated such that indices 0, 1 and 2 are returned for the following
- * fractions of entries:
- *
- * p_0 = (ceil(period / 2) - 1) / period
- * p_1 = floor(period / 2) / period
- * p_2 = 1 / period
- *
- * The equations above apply if \p flip is equal to 0, if it is equal to 1 p_0
- * and p_1 will be swapped for the result. Note that in the context of pixel
- * pipe hashing this can be always 0 on Gfx12 platforms, since the hardware
- * transparently remaps logical indices found on the table to physical pixel
- * pipe indices from the highest to lowest EU count.
- */
-UNUSED static void
-calculate_pixel_hashing_table(unsigned n, unsigned m,
- unsigned period, unsigned index, bool flip,
- uint32_t *p)
-{
- for (unsigned i = 0; i < n; i++) {
- for (unsigned j = 0; j < m; j++) {
- const unsigned k = (i + j) % period;
- p[j + m * i] = (k == index ? 2 : (k & 1) ^ flip);
- }
- }
-}
-
-static void
-genX(emit_slice_hashing_state)(struct anv_device *device,
- struct anv_batch *batch)
-{
- device->slice_hash = (struct anv_state) { 0 };
-
-#if GFX_VER == 11
- assert(device->info.ppipe_subslices[2] == 0);
-
- if (device->info.ppipe_subslices[0] == device->info.ppipe_subslices[1])
- return;
-
- unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
- device->slice_hash =
- anv_state_pool_alloc(&device->dynamic_state_pool, size, 64);
-
- const bool flip = device->info.ppipe_subslices[0] <
- device->info.ppipe_subslices[1];
- struct GENX(SLICE_HASH_TABLE) table;
- calculate_pixel_hashing_table(16, 16, 3, 3, flip, table.Entry[0]);
-
- GENX(SLICE_HASH_TABLE_pack)(NULL, device->slice_hash.map, &table);
-
- anv_batch_emit(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
- ptr.SliceHashStatePointerValid = true;
- ptr.SliceHashTableStatePointer = device->slice_hash.offset;
- }
-
- anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), mode) {
- mode.SliceHashingTableEnable = true;
- }
-#elif GFX_VERx10 == 120
- /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
- * present with n active dual subslices.
- */
- unsigned ppipes_of[3] = {};
-
- for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
- for (unsigned p = 0; p < ARRAY_SIZE(device->info.ppipe_subslices); p++)
- ppipes_of[n] += (device->info.ppipe_subslices[p] == n);
- }
-
- /* Gfx12 has three pixel pipes. */
- assert(ppipes_of[0] + ppipes_of[1] + ppipes_of[2] == 3);
-
- if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
- /* All three pixel pipes have the maximum number of active dual
- * subslices, or there is only one active pixel pipe: Nothing to do.
- */
- return;
- }
-
- anv_batch_emit(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
- p.SliceHashControl[0] = TABLE_0;
-
- if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
- calculate_pixel_hashing_table(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
- else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
- calculate_pixel_hashing_table(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
-
- if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
- calculate_pixel_hashing_table(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
- else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
- calculate_pixel_hashing_table(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
- else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
- calculate_pixel_hashing_table(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
- else
- unreachable("Illegal fusing.");
- }
-
- anv_batch_emit(batch, GENX(3DSTATE_3D_MODE), p) {
- p.SubsliceHashingTableEnable = true;
- p.SubsliceHashingTableEnableMask = true;
- }
-#endif
-}
-
-static VkResult
-init_render_queue_state(struct anv_queue *queue)
-{
- struct anv_device *device = queue->device;
- struct anv_batch batch;
-
- uint32_t cmds[64];
- batch.start = batch.next = cmds;
- batch.end = (void *) cmds + sizeof(cmds);
-
- anv_batch_emit(&batch, GENX(PIPELINE_SELECT), ps) {
-#if GFX_VER >= 9
- ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
- ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
-#endif
- ps.PipelineSelection = _3D;
- }
-
-#if GFX_VER == 9
- anv_batch_write_reg(&batch, GENX(CACHE_MODE_1), cm1) {
- cm1.FloatBlendOptimizationEnable = true;
- cm1.FloatBlendOptimizationEnableMask = true;
- cm1.MSCRAWHazardAvoidanceBit = true;
- cm1.MSCRAWHazardAvoidanceBitMask = true;
- cm1.PartialResolveDisableInVC = true;
- cm1.PartialResolveDisableInVCMask = true;
- }
-#endif
-
- anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
-
- anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
- rect.ClippedDrawingRectangleYMin = 0;
- rect.ClippedDrawingRectangleXMin = 0;
- rect.ClippedDrawingRectangleYMax = UINT16_MAX;
- rect.ClippedDrawingRectangleXMax = UINT16_MAX;
- rect.DrawingRectangleOriginY = 0;
- rect.DrawingRectangleOriginX = 0;
- }
-
-#if GFX_VER >= 8
- anv_batch_emit(&batch, GENX(3DSTATE_WM_CHROMAKEY), ck);
-
- genX(emit_sample_pattern)(&batch, 0, NULL);
-
- /* The BDW+ docs describe how to use the 3DSTATE_WM_HZ_OP instruction in the
- * section titled, "Optimized Depth Buffer Clear and/or Stencil Buffer
- * Clear." It mentions that the packet overrides GPU state for the clear
- * operation and needs to be reset to 0s to clear the overrides. Depending
- * on the kernel, we may not get a context with the state for this packet
- * zeroed. Do it ourselves just in case. We've observed this to prevent a
- * number of GPU hangs on ICL.
- */
- anv_batch_emit(&batch, GENX(3DSTATE_WM_HZ_OP), hzp);
-#endif
-
-#if GFX_VER == 11
- /* The default behavior of bit 5 "Headerless Message for Pre-emptable
- * Contexts" in SAMPLER MODE register is set to 0, which means
- * headerless sampler messages are not allowed for pre-emptable
- * contexts. Set the bit 5 to 1 to allow them.
- */
- anv_batch_write_reg(&batch, GENX(SAMPLER_MODE), sm) {
- sm.HeaderlessMessageforPreemptableContexts = true;
- sm.HeaderlessMessageforPreemptableContextsMask = true;
- }
-
- /* Bit 1 "Enabled Texel Offset Precision Fix" must be set in
- * HALF_SLICE_CHICKEN7 register.
- */
- anv_batch_write_reg(&batch, GENX(HALF_SLICE_CHICKEN7), hsc7) {
- hsc7.EnabledTexelOffsetPrecisionFix = true;
- hsc7.EnabledTexelOffsetPrecisionFixMask = true;
- }
-
- anv_batch_write_reg(&batch, GENX(TCCNTLREG), tcc) {
- tcc.L3DataPartialWriteMergingEnable = true;
- tcc.ColorZPartialWriteMergingEnable = true;
- tcc.URBPartialWriteMergingEnable = true;
- tcc.TCDisable = true;
- }
-#endif
- genX(emit_slice_hashing_state)(device, &batch);
-
-#if GFX_VER >= 11
- /* The hardware specification recommends disabling repacking for
- * compatibility with the decompression mechanism in the display
- * controller.
- */
- if (device->info.disable_ccs_repack) {
- anv_batch_write_reg(&batch, GENX(CACHE_MODE_0), cm0) {
- cm0.DisableRepackingforCompression = true;
- cm0.DisableRepackingforCompressionMask = true;
- }
- }
-
- /* An unknown issue is causing VS push constants to become
- * corrupted during object-level preemption. For now, restrict
- * to command-buffer-level preemption to avoid rendering
- * corruption.
- */
- anv_batch_write_reg(&batch, GENX(CS_CHICKEN1), cc1) {
- cc1.ReplayMode = MidcmdbufferPreemption;
- cc1.ReplayModeMask = true;
- }
-
-#if GFX_VERx10 < 125
-#define AA_LINE_QUALITY_REG GENX(3D_CHICKEN3)
-#else
-#define AA_LINE_QUALITY_REG GENX(CHICKEN_RASTER_1)
-#endif
-
- /* Enable the new line drawing algorithm that produces higher quality
- * lines.
- */
- anv_batch_write_reg(&batch, AA_LINE_QUALITY_REG, c3) {
- c3.AALineQualityFix = true;
- c3.AALineQualityFixMask = true;
- }
-#endif
-
-#if GFX_VER == 12
- if (device->info.has_aux_map) {
- uint64_t aux_base_addr = intel_aux_map_get_base(device->aux_map_ctx);
- assert(aux_base_addr % (32 * 1024) == 0);
- anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num);
- lri.DataDWord = aux_base_addr & 0xffffffff;
- }
- anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(GFX_AUX_TABLE_BASE_ADDR_num) + 4;
- lri.DataDWord = aux_base_addr >> 32;
- }
- }
-#endif
-
- /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
- * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
- *
- * This is only safe on kernels with context isolation support.
- */
- if (GFX_VER >= 8 && device->physical->has_context_isolation) {
-#if GFX_VER >= 9
- anv_batch_write_reg(&batch, GENX(CS_DEBUG_MODE2), csdm2) {
- csdm2.CONSTANT_BUFFERAddressOffsetDisable = true;
- csdm2.CONSTANT_BUFFERAddressOffsetDisableMask = true;
- }
-#elif GFX_VER == 8
- anv_batch_write_reg(&batch, GENX(INSTPM), instpm) {
- instpm.CONSTANT_BUFFERAddressOffsetDisable = true;
- instpm.CONSTANT_BUFFERAddressOffsetDisableMask = true;
- }
-#endif
- }
-
-#if GFX_VER >= 11
- /* Starting with GFX version 11, SLM is no longer part of the L3$ config
- * so it never changes throughout the lifetime of the VkDevice.
- */
- const struct intel_l3_config *cfg = intel_get_default_l3_config(&device->info);
- genX(emit_l3_config)(&batch, device, cfg);
- device->l3_config = cfg;
-#endif
-
- anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
-
- assert(batch.next <= batch.end);
-
- return anv_queue_submit_simple_batch(queue, &batch);
-}
-
-void
-genX(init_physical_device_state)(ASSERTED struct anv_physical_device *device)
-{
- assert(device->info.verx10 == GFX_VERx10);
-}
-
-VkResult
-genX(init_device_state)(struct anv_device *device)
-{
- VkResult res = VK_SUCCESS;
-
- for (uint32_t i = 0; i < device->queue_count; i++) {
- struct anv_queue *queue = &device->queues[i];
- switch (queue->family->engine_class) {
- case I915_ENGINE_CLASS_RENDER:
- res = init_render_queue_state(queue);
- break;
- default:
- res = vk_error(VK_ERROR_INITIALIZATION_FAILED);
- break;
- }
- if (res != VK_SUCCESS)
- return res;
- }
-
- return res;
-}
-
-void
-genX(emit_l3_config)(struct anv_batch *batch,
- const struct anv_device *device,
- const struct intel_l3_config *cfg)
-{
- UNUSED const struct intel_device_info *devinfo = &device->info;
-
-#if GFX_VER >= 8
-
-#if GFX_VER >= 12
-#define L3_ALLOCATION_REG GENX(L3ALLOC)
-#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
-#else
-#define L3_ALLOCATION_REG GENX(L3CNTLREG)
-#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
-#endif
-
- anv_batch_write_reg(batch, L3_ALLOCATION_REG, l3cr) {
- if (cfg == NULL) {
-#if GFX_VER >= 12
- l3cr.L3FullWayAllocationEnable = true;
-#else
- unreachable("Invalid L3$ config");
-#endif
- } else {
-#if GFX_VER < 11
- l3cr.SLMEnable = cfg->n[INTEL_L3P_SLM];
-#endif
-#if GFX_VER == 11
- /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be
- * set in L3CNTLREG register. The default setting of the bit is not
- * the desirable behavior.
- */
- l3cr.ErrorDetectionBehaviorControl = true;
- l3cr.UseFullWays = true;
-#endif /* GFX_VER == 11 */
- assert(cfg->n[INTEL_L3P_IS] == 0);
- assert(cfg->n[INTEL_L3P_C] == 0);
- assert(cfg->n[INTEL_L3P_T] == 0);
- l3cr.URBAllocation = cfg->n[INTEL_L3P_URB];
- l3cr.ROAllocation = cfg->n[INTEL_L3P_RO];
- l3cr.DCAllocation = cfg->n[INTEL_L3P_DC];
- l3cr.AllAllocation = cfg->n[INTEL_L3P_ALL];
- }
- }
-
-#else /* GFX_VER < 8 */
-
- const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
- const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
- cfg->n[INTEL_L3P_ALL];
- const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
- cfg->n[INTEL_L3P_ALL];
- const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
- cfg->n[INTEL_L3P_ALL];
-
- assert(!cfg->n[INTEL_L3P_ALL]);
-
- /* When enabled, SLM only uses a portion of the L3 on half of the banks,
- * the matching space on the remaining banks has to be allocated to a
- * client (URB for all validated configurations) set to the
- * lower-bandwidth 2-bank address hashing mode.
- */
- const bool urb_low_bw = cfg->n[INTEL_L3P_SLM] && !devinfo->is_baytrail;
- assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
-
- /* Minimum number of ways that can be allocated to the URB. */
- const unsigned n0_urb = devinfo->is_baytrail ? 32 : 0;
- assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
-
- anv_batch_write_reg(batch, GENX(L3SQCREG1), l3sqc) {
- l3sqc.ConvertDC_UC = !has_dc;
- l3sqc.ConvertIS_UC = !has_is;
- l3sqc.ConvertC_UC = !has_c;
- l3sqc.ConvertT_UC = !has_t;
-#if GFX_VERx10 == 75
- l3sqc.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
-#else
- l3sqc.L3SQGeneralPriorityCreditInitialization =
- devinfo->is_baytrail ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
-#endif
- l3sqc.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
- }
-
- anv_batch_write_reg(batch, GENX(L3CNTLREG2), l3cr2) {
- l3cr2.SLMEnable = cfg->n[INTEL_L3P_SLM];
- l3cr2.URBLowBandwidth = urb_low_bw;
- l3cr2.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
-#if GFX_VERx10 != 75
- l3cr2.ALLAllocation = cfg->n[INTEL_L3P_ALL];
-#endif
- l3cr2.ROAllocation = cfg->n[INTEL_L3P_RO];
- l3cr2.DCAllocation = cfg->n[INTEL_L3P_DC];
- }
-
- anv_batch_write_reg(batch, GENX(L3CNTLREG3), l3cr3) {
- l3cr3.ISAllocation = cfg->n[INTEL_L3P_IS];
- l3cr3.ISLowBandwidth = 0;
- l3cr3.CAllocation = cfg->n[INTEL_L3P_C];
- l3cr3.CLowBandwidth = 0;
- l3cr3.TAllocation = cfg->n[INTEL_L3P_T];
- l3cr3.TLowBandwidth = 0;
- }
-
-#if GFX_VERx10 == 75
- if (device->physical->cmd_parser_version >= 4) {
- /* Enable L3 atomics on HSW if we have a DC partition, otherwise keep
- * them disabled to avoid crashing the system hard.
- */
- anv_batch_write_reg(batch, GENX(SCRATCH1), s1) {
- s1.L3AtomicDisable = !has_dc;
- }
- anv_batch_write_reg(batch, GENX(CHICKEN3), c3) {
- c3.L3AtomicDisableMask = true;
- c3.L3AtomicDisable = !has_dc;
- }
- }
-#endif /* GFX_VERx10 == 75 */
-
-#endif /* GFX_VER < 8 */
-}
-
-void
-genX(emit_multisample)(struct anv_batch *batch, uint32_t samples,
- const VkSampleLocationEXT *locations)
-{
- anv_batch_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
- ms.NumberofMultisamples = __builtin_ffs(samples) - 1;
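- /* For the power-of-two sample counts accepted here, __builtin_ffs(x) - 1
- * equals log2(x), e.g. samples == 8 is encoded as 3.
- */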
-
- ms.PixelLocation = CENTER;
-#if GFX_VER >= 8
- /* The PRM says that this bit is valid only for DX9:
- *
- * SW can choose to set this bit only for DX9 API. DX10/OGL API's
- * should not have any effect by setting or not setting this bit.
- */
- ms.PixelPositionOffsetEnable = false;
-#else
-
- if (locations) {
- switch (samples) {
- case 1:
- INTEL_SAMPLE_POS_1X_ARRAY(ms.Sample, locations);
- break;
- case 2:
- INTEL_SAMPLE_POS_2X_ARRAY(ms.Sample, locations);
- break;
- case 4:
- INTEL_SAMPLE_POS_4X_ARRAY(ms.Sample, locations);
- break;
- case 8:
- INTEL_SAMPLE_POS_8X_ARRAY(ms.Sample, locations);
- break;
- default:
- break;
- }
- } else {
- switch (samples) {
- case 1:
- INTEL_SAMPLE_POS_1X(ms.Sample);
- break;
- case 2:
- INTEL_SAMPLE_POS_2X(ms.Sample);
- break;
- case 4:
- INTEL_SAMPLE_POS_4X(ms.Sample);
- break;
- case 8:
- INTEL_SAMPLE_POS_8X(ms.Sample);
- break;
- default:
- break;
- }
- }
-#endif
- }
-}
-
-#if GFX_VER >= 8
-void
-genX(emit_sample_pattern)(struct anv_batch *batch, uint32_t samples,
- const VkSampleLocationEXT *locations)
-{
- /* See the Vulkan 1.0 spec Table 24.1 "Standard sample locations" and
- * VkPhysicalDeviceFeatures::standardSampleLocations.
- */
- anv_batch_emit(batch, GENX(3DSTATE_SAMPLE_PATTERN), sp) {
- if (locations) {
- /* The Skylake PRM Vol. 2a "3DSTATE_SAMPLE_PATTERN" says:
- *
- * "When programming the sample offsets (for NUMSAMPLES_4 or _8
- * and MSRASTMODE_xxx_PATTERN), the order of the samples 0 to 3
- * (or 7 for 8X, or 15 for 16X) must have monotonically increasing
- * distance from the pixel center. This is required to get the
- * correct centroid computation in the device."
- *
- * However, the Vulkan spec seems to require that the samples
- * occur in the order provided through the API. The standard sample
- * patterns have the above property that they have monotonically
- * increasing distances from the center but client-provided ones do
- * not. As long as this only affects centroid calculations as the
- * docs say, we should be ok because OpenGL and Vulkan only require
- * that the centroid be some lit sample and that it's the same for
- * all samples in a pixel; they have no requirement that it be the
- * one closest to center.
- */
- switch (samples) {
- case 1:
- INTEL_SAMPLE_POS_1X_ARRAY(sp._1xSample, locations);
- break;
- case 2:
- INTEL_SAMPLE_POS_2X_ARRAY(sp._2xSample, locations);
- break;
- case 4:
- INTEL_SAMPLE_POS_4X_ARRAY(sp._4xSample, locations);
- break;
- case 8:
- INTEL_SAMPLE_POS_8X_ARRAY(sp._8xSample, locations);
- break;
-#if GFX_VER >= 9
- case 16:
- INTEL_SAMPLE_POS_16X_ARRAY(sp._16xSample, locations);
- break;
-#endif
- default:
- break;
- }
- } else {
- INTEL_SAMPLE_POS_1X(sp._1xSample);
- INTEL_SAMPLE_POS_2X(sp._2xSample);
- INTEL_SAMPLE_POS_4X(sp._4xSample);
- INTEL_SAMPLE_POS_8X(sp._8xSample);
-#if GFX_VER >= 9
- INTEL_SAMPLE_POS_16X(sp._16xSample);
-#endif
- }
- }
-}
-#endif
-
-#if GFX_VER >= 11
-void
-genX(emit_shading_rate)(struct anv_batch *batch,
- const struct anv_graphics_pipeline *pipeline,
- struct anv_state cps_states,
- struct anv_dynamic_state *dynamic_state)
-{
- const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- const bool cps_enable = wm_prog_data && wm_prog_data->per_coarse_pixel_dispatch;
-
-#if GFX_VER == 11
- anv_batch_emit(batch, GENX(3DSTATE_CPS), cps) {
- cps.CoarsePixelShadingMode = cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE;
- if (cps_enable) {
- cps.MinCPSizeX = dynamic_state->fragment_shading_rate.width;
- cps.MinCPSizeY = dynamic_state->fragment_shading_rate.height;
- }
- }
-#elif GFX_VER == 12
- for (uint32_t i = 0; i < dynamic_state->viewport.count; i++) {
- uint32_t *cps_state_dwords =
- cps_states.map + GENX(CPS_STATE_length) * 4 * i;
- struct GENX(CPS_STATE) cps_state = {
- .CoarsePixelShadingMode = cps_enable ? CPS_MODE_CONSTANT : CPS_MODE_NONE,
- };
-
- if (cps_enable) {
- cps_state.MinCPSizeX = dynamic_state->fragment_shading_rate.width;
- cps_state.MinCPSizeY = dynamic_state->fragment_shading_rate.height;
- }
-
- GENX(CPS_STATE_pack)(NULL, cps_state_dwords, &cps_state);
- }
-
- anv_batch_emit(batch, GENX(3DSTATE_CPS_POINTERS), cps) {
- cps.CoarsePixelShadingStateArrayPointer = cps_states.offset;
- }
-#endif
-}
-#endif /* GFX_VER >= 11 */
-
-static uint32_t
-vk_to_intel_tex_filter(VkFilter filter, bool anisotropyEnable)
-{
- switch (filter) {
- default:
- assert(!"Invalid filter");
- case VK_FILTER_NEAREST:
- return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_NEAREST;
- case VK_FILTER_LINEAR:
- return anisotropyEnable ? MAPFILTER_ANISOTROPIC : MAPFILTER_LINEAR;
- }
-}
-
-static uint32_t
-vk_to_intel_max_anisotropy(float ratio)
-{
- return (anv_clamp_f(ratio, 2, 16) - 2) / 2;
-}
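-
-/* For example, vk_to_intel_max_anisotropy(16.0) is (16 - 2) / 2 = 7 and any
- * ratio at or below 2.0 maps to 0; intermediate values truncate, so 5.0
- * maps to 1.
- */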
-
-static const uint32_t vk_to_intel_mipmap_mode[] = {
- [VK_SAMPLER_MIPMAP_MODE_NEAREST] = MIPFILTER_NEAREST,
- [VK_SAMPLER_MIPMAP_MODE_LINEAR] = MIPFILTER_LINEAR
-};
-
-static const uint32_t vk_to_intel_tex_address[] = {
- [VK_SAMPLER_ADDRESS_MODE_REPEAT] = TCM_WRAP,
- [VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT] = TCM_MIRROR,
- [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE] = TCM_CLAMP,
- [VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
- [VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
-};
-
-/* Vulkan specifies the result of shadow comparisons as:
- * 1 if ref <op> texel,
- * 0 otherwise.
- *
- * The hardware does:
- * 0 if texel <op> ref,
- * 1 otherwise.
- *
- * So, these look a bit strange because there's both a negation
- * and swapping of the arguments involved.
- */
-static const uint32_t vk_to_intel_shadow_compare_op[] = {
- [VK_COMPARE_OP_NEVER] = PREFILTEROP_ALWAYS,
- [VK_COMPARE_OP_LESS] = PREFILTEROP_LEQUAL,
- [VK_COMPARE_OP_EQUAL] = PREFILTEROP_NOTEQUAL,
- [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROP_LESS,
- [VK_COMPARE_OP_GREATER] = PREFILTEROP_GEQUAL,
- [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROP_EQUAL,
- [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROP_GREATER,
- [VK_COMPARE_OP_ALWAYS] = PREFILTEROP_NEVER,
-};
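-
-/* Worked example: for VK_COMPARE_OP_LESS, Vulkan wants 1 when ref < texel.
- * PREFILTEROP_LEQUAL makes the hardware return 0 when texel <= ref and 1
- * otherwise, i.e. 1 exactly when texel > ref, which is the same condition
- * with the operands swapped.
- */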
-
-#if GFX_VER >= 9
-static const uint32_t vk_to_intel_sampler_reduction_mode[] = {
- [VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT] = STD_FILTER,
- [VK_SAMPLER_REDUCTION_MODE_MIN_EXT] = MINIMUM,
- [VK_SAMPLER_REDUCTION_MODE_MAX_EXT] = MAXIMUM,
-};
-#endif
-
-VkResult genX(CreateSampler)(
- VkDevice _device,
- const VkSamplerCreateInfo* pCreateInfo,
- const VkAllocationCallbacks* pAllocator,
- VkSampler* pSampler)
-{
- ANV_FROM_HANDLE(anv_device, device, _device);
- struct anv_sampler *sampler;
-
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);
-
- sampler = vk_object_zalloc(&device->vk, pAllocator, sizeof(*sampler),
- VK_OBJECT_TYPE_SAMPLER);
- if (!sampler)
- return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
- sampler->n_planes = 1;
-
- uint32_t border_color_stride = GFX_VERx10 == 75 ? 512 : 64;
- uint32_t border_color_offset;
- ASSERTED bool has_custom_color = false;
- if (pCreateInfo->borderColor <= VK_BORDER_COLOR_INT_OPAQUE_WHITE) {
- border_color_offset = device->border_colors.offset +
- pCreateInfo->borderColor *
- border_color_stride;
- } else {
- assert(GFX_VER >= 8);
- sampler->custom_border_color =
- anv_state_reserved_pool_alloc(&device->custom_border_colors);
- border_color_offset = sampler->custom_border_color.offset;
- }
-
-#if GFX_VER >= 9
- unsigned sampler_reduction_mode = STD_FILTER;
- bool enable_sampler_reduction = false;
-#endif
-
- vk_foreach_struct(ext, pCreateInfo->pNext) {
- switch (ext->sType) {
- case VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO: {
- VkSamplerYcbcrConversionInfo *pSamplerConversion =
- (VkSamplerYcbcrConversionInfo *) ext;
- ANV_FROM_HANDLE(anv_ycbcr_conversion, conversion,
- pSamplerConversion->conversion);
-
- /* Ignore conversion for non-YUV formats. This fulfills a requirement
- * for clients that want to utilize the same code path for images with
- * external formats (VK_FORMAT_UNDEFINED) and "regular" RGBA images
- * where the format is known.
- */
- if (conversion == NULL || !conversion->format->can_ycbcr)
- break;
-
- sampler->n_planes = conversion->format->n_planes;
- sampler->conversion = conversion;
- break;
- }
-#if GFX_VER >= 9
- case VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO: {
- VkSamplerReductionModeCreateInfo *sampler_reduction =
- (VkSamplerReductionModeCreateInfo *) ext;
- sampler_reduction_mode =
- vk_to_intel_sampler_reduction_mode[sampler_reduction->reductionMode];
- enable_sampler_reduction = true;
- break;
- }
-#endif
- case VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT: {
- VkSamplerCustomBorderColorCreateInfoEXT *custom_border_color =
- (VkSamplerCustomBorderColorCreateInfoEXT *) ext;
- if (sampler->custom_border_color.map == NULL)
- break;
- struct gfx8_border_color *cbc = sampler->custom_border_color.map;
- if (custom_border_color->format == VK_FORMAT_B4G4R4A4_UNORM_PACK16) {
- /* B4G4R4A4_UNORM_PACK16 is treated as R4G4B4A4_UNORM_PACK16 with
- * a swizzle, but this does not carry over to the sampler for
- * border colors, so we need to do the swizzle ourselves here.
- */
- cbc->uint32[0] = custom_border_color->customBorderColor.uint32[2];
- cbc->uint32[1] = custom_border_color->customBorderColor.uint32[1];
- cbc->uint32[2] = custom_border_color->customBorderColor.uint32[0];
- cbc->uint32[3] = custom_border_color->customBorderColor.uint32[3];
- } else {
- /* Both structs share the same layout, so just copy them over. */
- memcpy(cbc, &custom_border_color->customBorderColor,
- sizeof(VkClearColorValue));
- }
- has_custom_color = true;
- break;
- }
- default:
- anv_debug_ignored_stype(ext->sType);
- break;
- }
- }
-
- assert((sampler->custom_border_color.map == NULL) || has_custom_color);
-
- if (device->physical->has_bindless_samplers) {
- /* If we have bindless, allocate enough samplers. We allocate 32 bytes
- * for each sampler instead of 16 bytes because we want all bindless
- * samplers to be 32-byte aligned so we don't have to use indirect
- * sampler messages on them.
- */
- sampler->bindless_state =
- anv_state_pool_alloc(&device->dynamic_state_pool,
- sampler->n_planes * 32, 32);
- }
-
- for (unsigned p = 0; p < sampler->n_planes; p++) {
- const bool plane_has_chroma =
- sampler->conversion && sampler->conversion->format->planes[p].has_chroma;
- const VkFilter min_filter =
- plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->minFilter;
- const VkFilter mag_filter =
- plane_has_chroma ? sampler->conversion->chroma_filter : pCreateInfo->magFilter;
- const bool enable_min_filter_addr_rounding = min_filter != VK_FILTER_NEAREST;
- const bool enable_mag_filter_addr_rounding = mag_filter != VK_FILTER_NEAREST;
- /* From Broadwell PRM, SAMPLER_STATE:
- * "Mip Mode Filter must be set to MIPFILTER_NONE for Planar YUV surfaces."
- */
- const bool isl_format_is_planar_yuv = sampler->conversion &&
- isl_format_is_yuv(sampler->conversion->format->planes[0].isl_format) &&
- isl_format_is_planar(sampler->conversion->format->planes[0].isl_format);
-
- const uint32_t mip_filter_mode =
- isl_format_is_planar_yuv ?
- MIPFILTER_NONE : vk_to_intel_mipmap_mode[pCreateInfo->mipmapMode];
-
- struct GENX(SAMPLER_STATE) sampler_state = {
- .SamplerDisable = false,
- .TextureBorderColorMode = DX10OGL,
-
-#if GFX_VER >= 11
- .CPSLODCompensationEnable = true,
-#endif
-
-#if GFX_VER >= 8
- .LODPreClampMode = CLAMP_MODE_OGL,
-#else
- .LODPreClampEnable = CLAMP_ENABLE_OGL,
-#endif
-
-#if GFX_VER == 8
- .BaseMipLevel = 0.0,
-#endif
- .MipModeFilter = mip_filter_mode,
- .MagModeFilter = vk_to_intel_tex_filter(mag_filter, pCreateInfo->anisotropyEnable),
- .MinModeFilter = vk_to_intel_tex_filter(min_filter, pCreateInfo->anisotropyEnable),
- .TextureLODBias = anv_clamp_f(pCreateInfo->mipLodBias, -16, 15.996),
- .AnisotropicAlgorithm =
- pCreateInfo->anisotropyEnable ? EWAApproximation : LEGACY,
- .MinLOD = anv_clamp_f(pCreateInfo->minLod, 0, 14),
- .MaxLOD = anv_clamp_f(pCreateInfo->maxLod, 0, 14),
- .ChromaKeyEnable = 0,
- .ChromaKeyIndex = 0,
- .ChromaKeyMode = 0,
- .ShadowFunction =
- vk_to_intel_shadow_compare_op[pCreateInfo->compareEnable ?
- pCreateInfo->compareOp : VK_COMPARE_OP_NEVER],
- .CubeSurfaceControlMode = OVERRIDE,
-
- .BorderColorPointer = border_color_offset,
-
-#if GFX_VER >= 8
- .LODClampMagnificationMode = MIPNONE,
-#endif
-
- .MaximumAnisotropy = vk_to_intel_max_anisotropy(pCreateInfo->maxAnisotropy),
- .RAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
- .RAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
- .VAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
- .VAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
- .UAddressMinFilterRoundingEnable = enable_min_filter_addr_rounding,
- .UAddressMagFilterRoundingEnable = enable_mag_filter_addr_rounding,
- .TrilinearFilterQuality = 0,
- .NonnormalizedCoordinateEnable = pCreateInfo->unnormalizedCoordinates,
- .TCXAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeU],
- .TCYAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeV],
- .TCZAddressControlMode = vk_to_intel_tex_address[pCreateInfo->addressModeW],
-
-#if GFX_VER >= 9
- .ReductionType = sampler_reduction_mode,
- .ReductionTypeEnable = enable_sampler_reduction,
-#endif
- };
-
- GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);
-
- if (sampler->bindless_state.map) {
- memcpy(sampler->bindless_state.map + p * 32,
- sampler->state[p], GENX(SAMPLER_STATE_length) * 4);
- }
- }
-
- *pSampler = anv_sampler_to_handle(sampler);
-
- return VK_SUCCESS;
-}
diff --git a/src/intel/vulkan/gfx7_cmd_buffer.c b/src/intel/vulkan/gfx7_cmd_buffer.c
deleted file mode 100644
index b092bd8c377..00000000000
--- a/src/intel/vulkan/gfx7_cmd_buffer.c
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "anv_private.h"
-#include "vk_format.h"
-
-#include "genxml/gen_macros.h"
-#include "genxml/genX_pack.h"
-
-#if GFX_VERx10 == 70
-static int64_t
-clamp_int64(int64_t x, int64_t min, int64_t max)
-{
- if (x < min)
- return min;
- else if (x < max)
- return x;
- else
- return max;
-}
-
-void
-gfx7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
- uint32_t count = cmd_buffer->state.gfx.dynamic.scissor.count;
- const VkRect2D *scissors = cmd_buffer->state.gfx.dynamic.scissor.scissors;
-
- /* Wa_1409725701:
- * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
- * stored as an array of up to 16 elements. The location of first
- * element of the array, as specified by Pointer to SCISSOR_RECT, should
- * be aligned to a 64-byte boundary."
- */
- uint32_t alignment = 64;
- struct anv_state scissor_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
-
- for (uint32_t i = 0; i < count; i++) {
- const VkRect2D *s = &scissors[i];
-
- /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
- * ymax < ymin for empty clips. In case clip x, y, width and height are all
- * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
- * what we want. Just special case empty clips and produce a canonical
- * empty clip. */
- static const struct GFX7_SCISSOR_RECT empty_scissor = {
- .ScissorRectangleYMin = 1,
- .ScissorRectangleXMin = 1,
- .ScissorRectangleYMax = 0,
- .ScissorRectangleXMax = 0
- };
-
- const int max = 0xffff;
-
- uint32_t y_min = s->offset.y;
- uint32_t x_min = s->offset.x;
- uint32_t y_max = s->offset.y + s->extent.height - 1;
- uint32_t x_max = s->offset.x + s->extent.width - 1;
-
- /* Do this math using int64_t so overflow gets clamped correctly. */
- if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
- y_min = clamp_int64((uint64_t) y_min,
- cmd_buffer->state.render_area.offset.y, max);
- x_min = clamp_int64((uint64_t) x_min,
- cmd_buffer->state.render_area.offset.x, max);
- y_max = clamp_int64((uint64_t) y_max, 0,
- cmd_buffer->state.render_area.offset.y +
- cmd_buffer->state.render_area.extent.height - 1);
- x_max = clamp_int64((uint64_t) x_max, 0,
- cmd_buffer->state.render_area.offset.x +
- cmd_buffer->state.render_area.extent.width - 1);
- } else if (fb) {
- y_min = clamp_int64((uint64_t) y_min, 0, max);
- x_min = clamp_int64((uint64_t) x_min, 0, max);
- y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1);
- x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1);
- }
-
- struct GFX7_SCISSOR_RECT scissor = {
- .ScissorRectangleYMin = y_min,
- .ScissorRectangleXMin = x_min,
- .ScissorRectangleYMax = y_max,
- .ScissorRectangleXMax = x_max
- };
-
- if (s->extent.width <= 0 || s->extent.height <= 0) {
- GFX7_SCISSOR_RECT_pack(NULL, scissor_state.map + i * 8,
- &empty_scissor);
- } else {
- GFX7_SCISSOR_RECT_pack(NULL, scissor_state.map + i * 8, &scissor);
- }
- }
-
- anv_batch_emit(&cmd_buffer->batch,
- GFX7_3DSTATE_SCISSOR_STATE_POINTERS, ssp) {
- ssp.ScissorRectPointer = scissor_state.offset;
- }
-}
-#endif
-
-static uint32_t vk_to_intel_index_type(VkIndexType type)
-{
- switch (type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return INDEX_BYTE;
- case VK_INDEX_TYPE_UINT16:
- return INDEX_WORD;
- case VK_INDEX_TYPE_UINT32:
- return INDEX_DWORD;
- default:
- unreachable("invalid index type");
- }
-}
-
-static uint32_t restart_index_for_type(VkIndexType type)
-{
- switch (type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return UINT8_MAX;
- case VK_INDEX_TYPE_UINT16:
- return UINT16_MAX;
- case VK_INDEX_TYPE_UINT32:
- return UINT32_MAX;
- default:
- unreachable("invalid index type");
- }
-}
-
-void genX(CmdBindIndexBuffer)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkIndexType indexType)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
- if (GFX_VERx10 == 75)
- cmd_buffer->state.restart_index = restart_index_for_type(indexType);
- cmd_buffer->state.gfx.gfx7.index_buffer = buffer;
- cmd_buffer->state.gfx.gfx7.index_type = vk_to_intel_index_type(indexType);
- cmd_buffer->state.gfx.gfx7.index_offset = offset;
-}
-
-static uint32_t
-get_depth_format(struct anv_cmd_buffer *cmd_buffer)
-{
- const struct anv_render_pass *pass = cmd_buffer->state.pass;
- const struct anv_subpass *subpass = cmd_buffer->state.subpass;
-
- if (!subpass->depth_stencil_attachment)
- return D16_UNORM;
-
- struct anv_render_pass_attachment *att =
- &pass->attachments[subpass->depth_stencil_attachment->attachment];
-
- switch (att->format) {
- case VK_FORMAT_D16_UNORM:
- case VK_FORMAT_D16_UNORM_S8_UINT:
- return D16_UNORM;
-
- case VK_FORMAT_X8_D24_UNORM_PACK32:
- case VK_FORMAT_D24_UNORM_S8_UINT:
- return D24_UNORM_X8_UINT;
-
- case VK_FORMAT_D32_SFLOAT:
- case VK_FORMAT_D32_SFLOAT_S8_UINT:
- return D32_FLOAT;
-
- default:
- return D16_UNORM;
- }
-}
-
-void
-genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- uint32_t topology;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
- topology = pipeline->topology;
- else
- topology = genX(vk_to_intel_primitive_type)[d->primitive_topology];
-
- cmd_buffer->state.gfx.primitive_topology = topology;
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_RENDER_TARGETS |
- ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)) {
- /* Take dynamic primitive topology into account with
- * 3DSTATE_SF::MultisampleRasterizationMode
- */
- uint32_t ms_rast_mode = 0;
-
- if (cmd_buffer->state.gfx.pipeline->dynamic_states &
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- VkPrimitiveTopology primitive_topology =
- cmd_buffer->state.gfx.dynamic.primitive_topology;
-
- VkPolygonMode dynamic_raster_mode =
- genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
- primitive_topology);
-
- ms_rast_mode =
- genX(ms_rasterization_mode)(pipeline, dynamic_raster_mode);
- }
-
- uint32_t sf_dw[GENX(3DSTATE_SF_length)];
- struct GENX(3DSTATE_SF) sf = {
- GENX(3DSTATE_SF_header),
- .DepthBufferSurfaceFormat = get_depth_format(cmd_buffer),
- .LineWidth = d->line_width,
- .GlobalDepthOffsetConstant = d->depth_bias.bias,
- .GlobalDepthOffsetScale = d->depth_bias.slope,
- .GlobalDepthOffsetClamp = d->depth_bias.clamp,
- .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
- .CullMode = genX(vk_to_intel_cullmode)[d->cull_mode],
- .GlobalDepthOffsetEnableSolid = d->depth_bias_enable,
- .GlobalDepthOffsetEnableWireframe = d->depth_bias_enable,
- .GlobalDepthOffsetEnablePoint = d->depth_bias_enable,
- .MultisampleRasterizationMode = ms_rast_mode,
- };
- GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf);
-
- anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gfx7.sf);
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
- struct anv_state cc_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- GENX(COLOR_CALC_STATE_length) * 4,
- 64);
- struct GENX(COLOR_CALC_STATE) cc = {
- .BlendConstantColorRed = d->blend_constants[0],
- .BlendConstantColorGreen = d->blend_constants[1],
- .BlendConstantColorBlue = d->blend_constants[2],
- .BlendConstantColorAlpha = d->blend_constants[3],
- .StencilReferenceValue = d->stencil_reference.front & 0xff,
- .BackfaceStencilReferenceValue = d->stencil_reference.back & 0xff,
- };
- GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
- ccp.ColorCalcStatePointer = cc_state.offset;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
- ls.LineStipplePattern = d->line_stipple.pattern;
- ls.LineStippleInverseRepeatCount =
- 1.0f / MAX2(1, d->line_stipple.factor);
- ls.LineStippleRepeatCount = d->line_stipple.factor;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_RENDER_TARGETS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) {
- uint32_t depth_stencil_dw[GENX(DEPTH_STENCIL_STATE_length)];
-
- struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
- .StencilTestMask = d->stencil_compare_mask.front & 0xff,
- .StencilWriteMask = d->stencil_write_mask.front & 0xff,
-
- .BackfaceStencilTestMask = d->stencil_compare_mask.back & 0xff,
- .BackfaceStencilWriteMask = d->stencil_write_mask.back & 0xff,
-
- .StencilBufferWriteEnable =
- (d->stencil_write_mask.front || d->stencil_write_mask.back) &&
- d->stencil_test_enable,
-
- .DepthTestEnable = d->depth_test_enable,
- .DepthBufferWriteEnable = d->depth_test_enable && d->depth_write_enable,
- .DepthTestFunction = genX(vk_to_intel_compare_op)[d->depth_compare_op],
- .StencilTestEnable = d->stencil_test_enable,
- .StencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.fail_op],
- .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.pass_op],
- .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.depth_fail_op],
- .StencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.front.compare_op],
- .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.fail_op],
- .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.pass_op],
- .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.depth_fail_op],
- .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.back.compare_op],
- };
- GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
-
- struct anv_state ds_state =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, depth_stencil_dw,
- pipeline->gfx7.depth_stencil_state,
- GENX(DEPTH_STENCIL_STATE_length), 64);
-
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), dsp) {
- dsp.PointertoDEPTH_STENCIL_STATE = ds_state.offset;
- }
- }
-
- if (cmd_buffer->state.gfx.gfx7.index_buffer &&
- cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_INDEX_BUFFER |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)) {
- struct anv_buffer *buffer = cmd_buffer->state.gfx.gfx7.index_buffer;
- uint32_t offset = cmd_buffer->state.gfx.gfx7.index_offset;
-
-#if GFX_VERx10 == 75
- anv_batch_emit(&cmd_buffer->batch, GFX75_3DSTATE_VF, vf) {
- vf.IndexedDrawCutIndexEnable = d->primitive_restart_enable;
- vf.CutIndex = cmd_buffer->state.restart_index;
- }
-#endif
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
-#if GFX_VERx10 != 75
- ib.CutIndexEnable = d->primitive_restart_enable;
-#endif
- ib.IndexFormat = cmd_buffer->state.gfx.gfx7.index_type;
- ib.MOCS = anv_mocs(cmd_buffer->device,
- buffer->address.bo,
- ISL_SURF_USAGE_INDEX_BUFFER_BIT);
-
- ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
- ib.BufferEndingAddress = anv_address_add(buffer->address,
- buffer->size);
- }
- }
-
- /* Re-emit 3DSTATE_WM in the hope we can avoid spawning fragment shader
- * threads, or when we have dirty dynamic primitive topology state and
- * need to toggle 3DSTATE_WM::MultisampleRasterizationMode dynamically.
- */
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE ||
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- const uint8_t color_writes = cmd_buffer->state.gfx.dynamic.color_writes;
-
- bool dirty_color_blend =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
-
- bool dirty_primitive_topology =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
-
- VkPolygonMode dynamic_raster_mode;
- VkPrimitiveTopology primitive_topology =
- cmd_buffer->state.gfx.dynamic.primitive_topology;
- dynamic_raster_mode =
- genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
- primitive_topology);
-
- if (dirty_color_blend || dirty_primitive_topology) {
- uint32_t dwords[GENX(3DSTATE_WM_length)];
- struct GENX(3DSTATE_WM) wm = {
- GENX(3DSTATE_WM_header),
-
- .ThreadDispatchEnable = pipeline->force_fragment_thread_dispatch ||
- color_writes,
- .MultisampleRasterizationMode =
- genX(ms_rasterization_mode)(pipeline, dynamic_raster_mode),
- };
- GENX(3DSTATE_WM_pack)(NULL, dwords, &wm);
-
- anv_batch_emit_merge(&cmd_buffer->batch, dwords, pipeline->gfx7.wm);
- }
-
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) {
- genX(emit_multisample)(&cmd_buffer->batch,
- cmd_buffer->state.gfx.dynamic.sample_locations.samples,
- cmd_buffer->state.gfx.dynamic.sample_locations.locations);
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE ||
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP) {
- const uint8_t color_writes = cmd_buffer->state.gfx.dynamic.color_writes;
- bool dirty_color_blend =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
-
- /* Blend states of each RT */
- uint32_t surface_count = 0;
- struct anv_pipeline_bind_map *map;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
- surface_count = map->surface_count;
- }
-
- uint32_t blend_dws[GENX(BLEND_STATE_length) +
- MAX_RTS * GENX(BLEND_STATE_ENTRY_length)];
- uint32_t *dws = blend_dws;
- memset(blend_dws, 0, sizeof(blend_dws));
-
- /* Skip the leading BLEND_STATE dwords; only the per-RT entries are filled in here. */
- dws += GENX(BLEND_STATE_length);
-
- bool dirty_logic_op =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
-
- for (uint32_t i = 0; i < surface_count; i++) {
- struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
- bool write_disabled =
- dirty_color_blend && (color_writes & (1u << binding->index)) == 0;
- struct GENX(BLEND_STATE_ENTRY) entry = {
- .WriteDisableAlpha = write_disabled,
- .WriteDisableRed = write_disabled,
- .WriteDisableGreen = write_disabled,
- .WriteDisableBlue = write_disabled,
- .LogicOpFunction =
- dirty_logic_op ? genX(vk_to_intel_logic_op)[d->logic_op] : 0,
- };
- GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
- dws += GENX(BLEND_STATE_ENTRY_length);
- }
-
- uint32_t num_dwords = GENX(BLEND_STATE_length) +
- GENX(BLEND_STATE_ENTRY_length) * surface_count;
-
- struct anv_state blend_states =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, blend_dws,
- pipeline->gfx7.blend_state, num_dwords, 64);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
- bsp.BlendStatePointer = blend_states.offset;
- }
- }
-
- cmd_buffer->state.gfx.dirty = 0;
-}
-
-void
-genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer,
- bool enable)
-{
- /* The NP PMA fix doesn't exist on gfx7 */
-}
diff --git a/src/intel/vulkan/gfx8_cmd_buffer.c b/src/intel/vulkan/gfx8_cmd_buffer.c
deleted file mode 100644
index 95250fa01d0..00000000000
--- a/src/intel/vulkan/gfx8_cmd_buffer.c
+++ /dev/null
@@ -1,844 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <assert.h>
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "anv_private.h"
-
-#include "genxml/gen_macros.h"
-#include "genxml/genX_pack.h"
-#include "common/intel_guardband.h"
-
-#if GFX_VER == 8
-void
-gfx8_cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
- uint32_t count = cmd_buffer->state.gfx.dynamic.viewport.count;
- const VkViewport *viewports =
- cmd_buffer->state.gfx.dynamic.viewport.viewports;
- struct anv_state sf_clip_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
-
- for (uint32_t i = 0; i < count; i++) {
- const VkViewport *vp = &viewports[i];
-
- /* The gfx7 state struct has just the matrix and guardband fields; the
- * gfx8 struct adds the min/max viewport fields.
- */
- struct GENX(SF_CLIP_VIEWPORT) sfv = {
- .ViewportMatrixElementm00 = vp->width / 2,
- .ViewportMatrixElementm11 = vp->height / 2,
- .ViewportMatrixElementm22 = vp->maxDepth - vp->minDepth,
- .ViewportMatrixElementm30 = vp->x + vp->width / 2,
- .ViewportMatrixElementm31 = vp->y + vp->height / 2,
- .ViewportMatrixElementm32 = vp->minDepth,
- .XMinClipGuardband = -1.0f,
- .XMaxClipGuardband = 1.0f,
- .YMinClipGuardband = -1.0f,
- .YMaxClipGuardband = 1.0f,
- .XMinViewPort = vp->x,
- .XMaxViewPort = vp->x + vp->width - 1,
- .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
- .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
- };
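-
- /* These coefficients map NDC x in [-1, 1] onto [vp->x, vp->x + vp->width]
- * (y and depth are handled analogously): e.g. a 1920-wide viewport at
- * x = 0 has m00 = 960 and m30 = 960, so -1 maps to 0 and +1 maps to 1920.
- */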
-
- if (fb) {
- /* We can only calculate a "real" guardband clip if we know the
- * framebuffer at the time we emit the packet. Otherwise, we have to
- * fall back to a worst-case guardband of [-1, 1].
- */
- intel_calculate_guardband_size(fb->width, fb->height,
- sfv.ViewportMatrixElementm00,
- sfv.ViewportMatrixElementm11,
- sfv.ViewportMatrixElementm30,
- sfv.ViewportMatrixElementm31,
- &sfv.XMinClipGuardband,
- &sfv.XMaxClipGuardband,
- &sfv.YMinClipGuardband,
- &sfv.YMaxClipGuardband);
- }
-
- GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
- }
-
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
- clip.SFClipViewportPointer = sf_clip_state.offset;
- }
-}
-
-void
-gfx8_cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
- bool depth_clamp_enable)
-{
- uint32_t count = cmd_buffer->state.gfx.dynamic.viewport.count;
- const VkViewport *viewports =
- cmd_buffer->state.gfx.dynamic.viewport.viewports;
- struct anv_state cc_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
-
- for (uint32_t i = 0; i < count; i++) {
- const VkViewport *vp = &viewports[i];
-
- /* From the Vulkan spec:
- *
- * "It is valid for minDepth to be greater than or equal to
- * maxDepth."
- */
- float min_depth = MIN2(vp->minDepth, vp->maxDepth);
- float max_depth = MAX2(vp->minDepth, vp->maxDepth);
-
- struct GENX(CC_VIEWPORT) cc_viewport = {
- .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
- .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
- };
-
- GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
- }
-
- anv_batch_emit(&cmd_buffer->batch,
- GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
- cc.CCViewportPointer = cc_state.offset;
- }
-}
-#endif
-
-void
-genX(cmd_buffer_enable_pma_fix)(struct anv_cmd_buffer *cmd_buffer, bool enable)
-{
- if (cmd_buffer->state.pma_fix_enabled == enable)
- return;
-
- cmd_buffer->state.pma_fix_enabled = enable;
-
- /* According to the Broadwell PIPE_CONTROL documentation, software should
- * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
- * prior to the LRI. If stencil buffer writes are enabled, then a Render
- * Cache Flush is also necessary.
- *
- * The Skylake docs say to use a depth stall rather than a command
- * streamer stall. However, the hardware seems to violently disagree.
- * A full command streamer stall seems to be needed in both cases.
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DepthCacheFlushEnable = true;
- pc.CommandStreamerStallEnable = true;
- pc.RenderTargetCacheFlushEnable = true;
-#if GFX_VER >= 12
- pc.TileCacheFlushEnable = true;
-
- /* Wa_1409600907: "PIPE_CONTROL with Depth Stall Enable bit must
- * be set with any PIPE_CONTROL with Depth Flush Enable bit set."
- */
- pc.DepthStallEnable = true;
-#endif
- }
-
-#if GFX_VER == 9
-
- uint32_t cache_mode;
- anv_pack_struct(&cache_mode, GENX(CACHE_MODE_0),
- .STCPMAOptimizationEnable = enable,
- .STCPMAOptimizationEnableMask = true);
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(CACHE_MODE_0_num);
- lri.DataDWord = cache_mode;
- }
-
-#elif GFX_VER == 8
-
- uint32_t cache_mode;
- anv_pack_struct(&cache_mode, GENX(CACHE_MODE_1),
- .NPPMAFixEnable = enable,
- .NPEarlyZFailsDisable = enable,
- .NPPMAFixEnableMask = true,
- .NPEarlyZFailsDisableMask = true);
- anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
- lri.RegisterOffset = GENX(CACHE_MODE_1_num);
- lri.DataDWord = cache_mode;
- }
-
-#endif /* GFX_VER == 8 */
-
- /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
- * Flush bits is often necessary. We do it regardless because it's easier.
- * The render cache flush is also necessary if stencil writes are enabled.
- *
- * Again, the Skylake docs give a different set of flushes but the BDW
- * flushes seem to work just as well.
- */
- anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
- pc.DepthStallEnable = true;
- pc.DepthCacheFlushEnable = true;
- pc.RenderTargetCacheFlushEnable = true;
-#if GFX_VER >= 12
- pc.TileCacheFlushEnable = true;
-#endif
- }
-}
-
-UNUSED static bool
-want_depth_pma_fix(struct anv_cmd_buffer *cmd_buffer)
-{
- assert(GFX_VER == 8);
-
- /* From the Broadwell PRM Vol. 2c CACHE_MODE_1::NP_PMA_FIX_ENABLE:
- *
- * SW must set this bit in order to enable this fix when following
- * expression is TRUE.
- *
- * 3DSTATE_WM::ForceThreadDispatch != 1 &&
- * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
- * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
- * (3DSTATE_DEPTH_BUFFER::HIZ Enable) &&
- * !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) &&
- * (3DSTATE_PS_EXTRA::PixelShaderValid) &&
- * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
- * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
- * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
- * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
- * (3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable) &&
- * (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
- * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
- * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
- * 3DSTATE_PS_BLEND::AlphaTestEnable ||
- * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
- * 3DSTATE_WM::ForceKillPix != ForceOff &&
- * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
- * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
- * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
- * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
- * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
- * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
- */
-
- /* These are always true:
- * 3DSTATE_WM::ForceThreadDispatch != 1 &&
- * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
- */
-
- /* We only enable the PMA fix if we know for certain that HiZ is enabled.
- * If we don't know whether HiZ is enabled or not, we disable the PMA fix
- * and there is no harm.
- *
- * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
- * 3DSTATE_DEPTH_BUFFER::HIZ Enable
- */
- if (!cmd_buffer->state.hiz_enabled)
- return false;
-
- /* 3DSTATE_PS_EXTRA::PixelShaderValid */
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
- return false;
-
- /* !(3DSTATE_WM::EDSC_Mode == EDSC_PREPS) */
- const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- if (wm_prog_data->early_fragment_tests)
- return false;
-
- /* We never use anv_pipeline for HiZ ops so this is trivially true:
- * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
- * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
- * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
- * 3DSTATE_WM_HZ_OP::StencilBufferClear)
- */
-
- /* 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable */
- if (!pipeline->depth_test_enable)
- return false;
-
- /* (((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
- * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
- * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
- * 3DSTATE_PS_BLEND::AlphaTestEnable ||
- * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) &&
- * 3DSTATE_WM::ForceKillPix != ForceOff &&
- * ((3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
- * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE) ||
- * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
- * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
- * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE))) ||
- * (3DSTATE_PS_EXTRA:: Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
- */
- return (pipeline->kill_pixel && (pipeline->writes_depth ||
- pipeline->writes_stencil)) ||
- wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
-}
-
-UNUSED static bool
-want_stencil_pma_fix(struct anv_cmd_buffer *cmd_buffer)
-{
- if (GFX_VER > 9)
- return false;
- assert(GFX_VER == 9);
-
- /* From the Skylake PRM Vol. 2c CACHE_MODE_1::STC PMA Optimization Enable:
- *
- * Clearing this bit will force the STC cache to wait for pending
- * retirement of pixels at the HZ-read stage and do the STC-test for
- * Non-promoted, R-computed and Computed depth modes instead of
- * postponing the STC-test to RCPFE.
- *
- * STC_TEST_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
- * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
- *
- * STC_WRITE_EN = 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
- * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
- * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
- *
- * COMP_STC_EN = STC_TEST_EN &&
- * 3DSTATE_PS_EXTRA::PixelShaderComputesStencil
- *
- * SW parses the pipeline states to generate the following logical
- * signal indicating if PMA FIX can be enabled.
- *
- * STC_PMA_OPT =
- * 3DSTATE_WM::ForceThreadDispatch != 1 &&
- * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0) &&
- * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
- * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
- * !(3DSTATE_WM::EDSC_Mode == 2) &&
- * 3DSTATE_PS_EXTRA::PixelShaderValid &&
- * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
- * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
- * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
- * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
- * (COMP_STC_EN || STC_WRITE_EN) &&
- * ((3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
- * 3DSTATE_WM::ForceKillPix == ON ||
- * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
- * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
- * 3DSTATE_PS_BLEND::AlphaTestEnable ||
- * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
- * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF))
- */
-
- /* These are always true:
- * 3DSTATE_WM::ForceThreadDispatch != 1 &&
- * !(3DSTATE_RASTER::ForceSampleCount != NUMRASTSAMPLES_0)
- */
-
- /* We only enable the PMA fix if we know for certain that HiZ is enabled.
- * If we don't know whether HiZ is enabled or not, we disable the PMA fix
- * and there is no harm.
- *
- * (3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL) &&
- * 3DSTATE_DEPTH_BUFFER::HIZ Enable
- */
- if (!cmd_buffer->state.hiz_enabled)
- return false;
-
- /* We can't possibly know if HiZ is enabled without the framebuffer */
- assert(cmd_buffer->state.framebuffer);
-
- /* HiZ is enabled so we had better have a depth buffer with HiZ */
- const struct anv_image_view *ds_iview =
- anv_cmd_buffer_get_depth_stencil_view(cmd_buffer);
- assert(ds_iview && ds_iview->image->planes[0].aux_usage == ISL_AUX_USAGE_HIZ);
-
- /* 3DSTATE_PS_EXTRA::PixelShaderValid */
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
- return false;
-
- /* !(3DSTATE_WM::EDSC_Mode == 2) */
- const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
- if (wm_prog_data->early_fragment_tests)
- return false;
-
- /* We never use anv_pipeline for HiZ ops so this is trivially true:
- * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
- * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
- * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
- * 3DSTATE_WM_HZ_OP::StencilBufferClear)
- */
-
- /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
- * 3DSTATE_WM_DEPTH_STENCIL::StencilTestEnable
- */
- const bool stc_test_en =
- (ds_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- pipeline->stencil_test_enable;
-
- /* 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE &&
- * (3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
- * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE)
- */
- const bool stc_write_en =
- (ds_iview->image->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
- (cmd_buffer->state.gfx.dynamic.stencil_write_mask.front ||
- cmd_buffer->state.gfx.dynamic.stencil_write_mask.back) &&
- pipeline->writes_stencil;
-
- /* STC_TEST_EN && 3DSTATE_PS_EXTRA::PixelShaderComputesStencil */
- const bool comp_stc_en = stc_test_en && wm_prog_data->computed_stencil;
-
- /* COMP_STC_EN || STC_WRITE_EN */
- if (!(comp_stc_en || stc_write_en))
- return false;
-
- /* (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
- * 3DSTATE_WM::ForceKillPix == ON ||
- * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
- * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
- * 3DSTATE_PS_BLEND::AlphaTestEnable ||
- * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable) ||
- * (3DSTATE_PS_EXTRA::Pixel Shader Computed Depth mode != PSCDEPTH_OFF)
- */
- return pipeline->kill_pixel ||
- wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
-}
-
-void
-genX(cmd_buffer_flush_dynamic_state)(struct anv_cmd_buffer *cmd_buffer)
-{
- struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
- struct anv_dynamic_state *d = &cmd_buffer->state.gfx.dynamic;
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- uint32_t topology;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
- topology = pipeline->topology;
- else
- topology = genX(vk_to_intel_primitive_type)[d->primitive_topology];
-
- cmd_buffer->state.gfx.primitive_topology = topology;
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
- vft.PrimitiveTopologyType = topology;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)) {
- uint32_t sf_dw[GENX(3DSTATE_SF_length)];
- struct GENX(3DSTATE_SF) sf = {
- GENX(3DSTATE_SF_header),
- };
-#if GFX_VER == 8
- if (cmd_buffer->device->info.is_cherryview) {
- sf.CHVLineWidth = d->line_width;
- } else {
- sf.LineWidth = d->line_width;
- }
-#else
- sf.LineWidth = d->line_width;
-#endif
- GENX(3DSTATE_SF_pack)(NULL, sf_dw, &sf);
- anv_batch_emit_merge(&cmd_buffer->batch, sf_dw, pipeline->gfx8.sf);
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
- ANV_CMD_DIRTY_DYNAMIC_CULL_MODE |
- ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)) {
- /* Take dynamic primitive topology into account with
- * 3DSTATE_RASTER::APIMode
- * 3DSTATE_RASTER::DXMultisampleRasterizationEnable
- * 3DSTATE_RASTER::AntialiasingEnable
- */
- uint32_t api_mode = 0;
- bool msaa_raster_enable = false;
- bool aa_enable = false;
-
- if (cmd_buffer->state.gfx.pipeline->dynamic_states &
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY) {
- VkPrimitiveTopology primitive_topology =
- cmd_buffer->state.gfx.dynamic.primitive_topology;
-
- VkPolygonMode dynamic_raster_mode =
- genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
- primitive_topology);
-
- genX(rasterization_mode)(
- dynamic_raster_mode, pipeline->line_mode, d->line_width,
- &api_mode, &msaa_raster_enable);
-
- aa_enable =
- anv_rasterization_aa_mode(dynamic_raster_mode,
- pipeline->line_mode);
- }
-
- uint32_t raster_dw[GENX(3DSTATE_RASTER_length)];
- struct GENX(3DSTATE_RASTER) raster = {
- GENX(3DSTATE_RASTER_header),
- .APIMode = api_mode,
- .DXMultisampleRasterizationEnable = msaa_raster_enable,
- .AntialiasingEnable = aa_enable,
- .GlobalDepthOffsetConstant = d->depth_bias.bias,
- .GlobalDepthOffsetScale = d->depth_bias.slope,
- .GlobalDepthOffsetClamp = d->depth_bias.clamp,
- .CullMode = genX(vk_to_intel_cullmode)[d->cull_mode],
- .FrontWinding = genX(vk_to_intel_front_face)[d->front_face],
- .GlobalDepthOffsetEnableSolid = d->depth_bias_enable,
- .GlobalDepthOffsetEnableWireframe = d->depth_bias_enable,
- .GlobalDepthOffsetEnablePoint = d->depth_bias_enable,
- };
- GENX(3DSTATE_RASTER_pack)(NULL, raster_dw, &raster);
- anv_batch_emit_merge(&cmd_buffer->batch, raster_dw,
- pipeline->gfx8.raster);
- }
-
- /* Stencil reference values moved from COLOR_CALC_STATE in gfx8 to
- * 3DSTATE_WM_DEPTH_STENCIL in gfx9. That means the dirty bits get split
- * across different state packets for gfx8 and gfx9. We handle that by
- * using a big old #if switch here.
- */
-#if GFX_VER == 8
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE)) {
- struct anv_state cc_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- GENX(COLOR_CALC_STATE_length) * 4,
- 64);
- struct GENX(COLOR_CALC_STATE) cc = {
- .BlendConstantColorRed = d->blend_constants[0],
- .BlendConstantColorGreen = d->blend_constants[1],
- .BlendConstantColorBlue = d->blend_constants[2],
- .BlendConstantColorAlpha = d->blend_constants[3],
- .StencilReferenceValue = d->stencil_reference.front & 0xff,
- .BackfaceStencilReferenceValue = d->stencil_reference.back & 0xff,
- };
- GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
- ccp.ColorCalcStatePointer = cc_state.offset;
- ccp.ColorCalcStatePointerValid = true;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_RENDER_TARGETS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) {
- uint32_t wm_depth_stencil_dw[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
-
- struct GENX(3DSTATE_WM_DEPTH_STENCIL) wm_depth_stencil = {
- GENX(3DSTATE_WM_DEPTH_STENCIL_header),
-
- .StencilTestMask = d->stencil_compare_mask.front & 0xff,
- .StencilWriteMask = d->stencil_write_mask.front & 0xff,
-
- .BackfaceStencilTestMask = d->stencil_compare_mask.back & 0xff,
- .BackfaceStencilWriteMask = d->stencil_write_mask.back & 0xff,
-
- .StencilBufferWriteEnable =
- (d->stencil_write_mask.front || d->stencil_write_mask.back) &&
- d->stencil_test_enable,
-
- .DepthTestEnable = d->depth_test_enable,
- .DepthBufferWriteEnable = d->depth_test_enable && d->depth_write_enable,
- .DepthTestFunction = genX(vk_to_intel_compare_op)[d->depth_compare_op],
- .StencilTestEnable = d->stencil_test_enable,
- .StencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.fail_op],
- .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.pass_op],
- .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.depth_fail_op],
- .StencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.front.compare_op],
- .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.fail_op],
- .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.pass_op],
- .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.depth_fail_op],
- .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.back.compare_op],
- };
- GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, wm_depth_stencil_dw,
- &wm_depth_stencil);
-
- anv_batch_emit_merge(&cmd_buffer->batch, wm_depth_stencil_dw,
- pipeline->gfx8.wm_depth_stencil);
-
- genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
- want_depth_pma_fix(cmd_buffer));
- }
-#else
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS) {
- struct anv_state cc_state =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- GENX(COLOR_CALC_STATE_length) * 4,
- 64);
- struct GENX(COLOR_CALC_STATE) cc = {
- .BlendConstantColorRed = d->blend_constants[0],
- .BlendConstantColorGreen = d->blend_constants[1],
- .BlendConstantColorBlue = d->blend_constants[2],
- .BlendConstantColorAlpha = d->blend_constants[3],
- };
- GENX(COLOR_CALC_STATE_pack)(NULL, cc_state.map, &cc);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), ccp) {
- ccp.ColorCalcStatePointer = cc_state.offset;
- ccp.ColorCalcStatePointerValid = true;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_RENDER_TARGETS |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE |
- ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP)) {
- uint32_t dwords[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
- struct GENX(3DSTATE_WM_DEPTH_STENCIL) wm_depth_stencil = {
- GENX(3DSTATE_WM_DEPTH_STENCIL_header),
-
- .StencilTestMask = d->stencil_compare_mask.front & 0xff,
- .StencilWriteMask = d->stencil_write_mask.front & 0xff,
-
- .BackfaceStencilTestMask = d->stencil_compare_mask.back & 0xff,
- .BackfaceStencilWriteMask = d->stencil_write_mask.back & 0xff,
-
- .StencilReferenceValue = d->stencil_reference.front & 0xff,
- .BackfaceStencilReferenceValue = d->stencil_reference.back & 0xff,
-
- .StencilBufferWriteEnable =
- (d->stencil_write_mask.front || d->stencil_write_mask.back) &&
- d->stencil_test_enable,
-
- .DepthTestEnable = d->depth_test_enable,
- .DepthBufferWriteEnable = d->depth_test_enable && d->depth_write_enable,
- .DepthTestFunction = genX(vk_to_intel_compare_op)[d->depth_compare_op],
- .StencilTestEnable = d->stencil_test_enable,
- .StencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.fail_op],
- .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.pass_op],
- .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.front.depth_fail_op],
- .StencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.front.compare_op],
- .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.fail_op],
- .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.pass_op],
- .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[d->stencil_op.back.depth_fail_op],
- .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[d->stencil_op.back.compare_op],
- };
- GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dwords, &wm_depth_stencil);
-
- anv_batch_emit_merge(&cmd_buffer->batch, dwords,
- pipeline->gfx9.wm_depth_stencil);
-
- genX(cmd_buffer_enable_pma_fix)(cmd_buffer,
- want_stencil_pma_fix(cmd_buffer));
- }
-#endif
-
-#if GFX_VER >= 12
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
- ANV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE)) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
- db.DepthBoundsTestValueModifyDisable = false;
- db.DepthBoundsTestEnableModifyDisable = false;
- db.DepthBoundsTestEnable = d->depth_bounds_test_enable;
- db.DepthBoundsTestMinValue = d->depth_bounds.min;
- db.DepthBoundsTestMaxValue = d->depth_bounds.max;
- }
- }
-#endif
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_LINE_STIPPLE), ls) {
- ls.LineStipplePattern = d->line_stipple.pattern;
- ls.LineStippleInverseRepeatCount =
- 1.0f / MAX2(1, d->line_stipple.factor);
- ls.LineStippleRepeatCount = d->line_stipple.factor;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
- ANV_CMD_DIRTY_INDEX_BUFFER |
- ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)) {
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF), vf) {
- vf.IndexedDrawCutIndexEnable = d->primitive_restart_enable;
- vf.CutIndex = cmd_buffer->state.restart_index;
- }
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS) {
- genX(emit_sample_pattern)(&cmd_buffer->batch,
- cmd_buffer->state.gfx.dynamic.sample_locations.samples,
- cmd_buffer->state.gfx.dynamic.sample_locations.locations);
- }
-
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE ||
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP) {
- const uint8_t color_writes = cmd_buffer->state.gfx.dynamic.color_writes;
- /* Emit 3DSTATE_WM in the hope that we can avoid spawning fragment
- * shader threads.
- */
- bool dirty_color_blend =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
-
- if (dirty_color_blend) {
- uint32_t dwords[MAX2(GENX(3DSTATE_WM_length),
- GENX(3DSTATE_PS_BLEND_length))];
- struct GENX(3DSTATE_WM) wm = {
- GENX(3DSTATE_WM_header),
-
- .ForceThreadDispatchEnable = (pipeline->force_fragment_thread_dispatch ||
- !color_writes) ? ForceON : 0,
- };
- GENX(3DSTATE_WM_pack)(NULL, dwords, &wm);
-
- anv_batch_emit_merge(&cmd_buffer->batch, dwords, pipeline->gfx8.wm);
-
- /* Emit 3DSTATE_PS_BLEND to keep it consistent with the rest of the
- * BLEND_STATE_ENTRY state.
- */
- struct GENX(3DSTATE_PS_BLEND) ps_blend = {
- GENX(3DSTATE_PS_BLEND_header),
- .HasWriteableRT = color_writes != 0,
- };
- GENX(3DSTATE_PS_BLEND_pack)(NULL, dwords, &ps_blend);
- anv_batch_emit_merge(&cmd_buffer->batch, dwords, pipeline->gfx8.ps_blend);
- }
-
- /* Blend states of each RT */
- uint32_t surface_count = 0;
- struct anv_pipeline_bind_map *map;
- if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
- map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
- surface_count = map->surface_count;
- }
-
- uint32_t blend_dws[GENX(BLEND_STATE_length) +
- MAX_RTS * GENX(BLEND_STATE_ENTRY_length)];
- uint32_t *dws = blend_dws;
- memset(blend_dws, 0, sizeof(blend_dws));
-
- /* Skip the BLEND_STATE header dwords; they come from the pipeline
- * when the two are merged below.
- */
- dws += GENX(BLEND_STATE_length);
-
- bool dirty_logic_op =
- cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
-
- for (uint32_t i = 0; i < surface_count; i++) {
- struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
- bool write_disabled =
- dirty_color_blend && (color_writes & (1u << binding->index)) == 0;
- struct GENX(BLEND_STATE_ENTRY) entry = {
- .WriteDisableAlpha = write_disabled,
- .WriteDisableRed = write_disabled,
- .WriteDisableGreen = write_disabled,
- .WriteDisableBlue = write_disabled,
- .LogicOpFunction =
- dirty_logic_op ? genX(vk_to_intel_logic_op)[d->logic_op] : 0,
- };
- GENX(BLEND_STATE_ENTRY_pack)(NULL, dws, &entry);
- dws += GENX(BLEND_STATE_ENTRY_length);
- }
-
- uint32_t num_dwords = GENX(BLEND_STATE_length) +
- GENX(BLEND_STATE_ENTRY_length) * surface_count;
-
- struct anv_state blend_states =
- anv_cmd_buffer_merge_dynamic(cmd_buffer, blend_dws,
- pipeline->gfx8.blend_state, num_dwords, 64);
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
- bsp.BlendStatePointer = blend_states.offset;
- bsp.BlendStatePointerValid = true;
- }
- }
-
-#if GFX_VER >= 11
- if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) {
- struct anv_state cps_states = ANV_STATE_NULL;
-
-#if GFX_VER >= 12
- uint32_t count = cmd_buffer->state.gfx.dynamic.viewport.count;
- cps_states =
- anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
- GENX(CPS_STATE_length) * 4 * count,
- 32);
-#endif /* GFX_VER >= 12 */
-
- genX(emit_shading_rate)(&cmd_buffer->batch, pipeline, cps_states,
- &cmd_buffer->state.gfx.dynamic);
- }
-#endif /* GFX_VER >= 11 */
-
- cmd_buffer->state.gfx.dirty = 0;
-}
-
-static uint32_t vk_to_intel_index_type(VkIndexType type)
-{
- switch (type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return INDEX_BYTE;
- case VK_INDEX_TYPE_UINT16:
- return INDEX_WORD;
- case VK_INDEX_TYPE_UINT32:
- return INDEX_DWORD;
- default:
- unreachable("invalid index type");
- }
-}
-
-static uint32_t restart_index_for_type(VkIndexType type)
-{
- switch (type) {
- case VK_INDEX_TYPE_UINT8_EXT:
- return UINT8_MAX;
- case VK_INDEX_TYPE_UINT16:
- return UINT16_MAX;
- case VK_INDEX_TYPE_UINT32:
- return UINT32_MAX;
- default:
- unreachable("invalid index type");
- }
-}
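-
-/* For example, binding an index buffer with VK_INDEX_TYPE_UINT16 below
- * programs 3DSTATE_INDEX_BUFFER::IndexFormat = INDEX_WORD and a primitive
- * restart index of 0xffff, while VK_INDEX_TYPE_UINT32 uses INDEX_DWORD and
- * 0xffffffff.
- */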
-
-void genX(CmdBindIndexBuffer)(
- VkCommandBuffer commandBuffer,
- VkBuffer _buffer,
- VkDeviceSize offset,
- VkIndexType indexType)
-{
- ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
- ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
-
- cmd_buffer->state.restart_index = restart_index_for_type(indexType);
-
- anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
- ib.IndexFormat = vk_to_intel_index_type(indexType);
- ib.MOCS = anv_mocs(cmd_buffer->device,
- buffer->address.bo,
- ISL_SURF_USAGE_INDEX_BUFFER_BIT);
-#if GFX_VER >= 12
- ib.L3BypassDisable = true;
-#endif
- ib.BufferStartingAddress = anv_address_add(buffer->address, offset);
- ib.BufferSize = buffer->size - offset;
- }
-
- cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
-}
diff --git a/src/intel/vulkan/grl/.gitignore b/src/intel/vulkan/grl/.gitignore
new file mode 100644
index 00000000000..e2850ca03b1
--- /dev/null
+++ b/src/intel/vulkan/grl/.gitignore
@@ -0,0 +1 @@
+parsetab.py
diff --git a/src/intel/vulkan/grl/genX_grl.h b/src/intel/vulkan/grl/genX_grl.h
new file mode 100644
index 00000000000..57aefa72de0
--- /dev/null
+++ b/src/intel/vulkan/grl/genX_grl.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef ANV_GRL_H
+#define ANV_GRL_H
+
+#include "grl/grl_cl_kernel.h"
+#include "genxml/gen_macros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct anv_cmd_buffer;
+struct anv_kernel_arg;
+
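+/* Looks up (or compiles and caches) the binary for the given GRL kernel and
+ * dispatches it on the command buffer with the given global work size and
+ * kernel arguments.
+ */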
+void
+genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
+ enum grl_cl_kernel kernel,
+ const uint32_t *global_size,
+ uint32_t arg_count,
+ const struct anv_kernel_arg *args);
+
+void
+genX(grl_load_rt_uuid)(uint8_t *out_uuid);
+
+uint32_t
+genX(grl_max_scratch_size)(void);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* ANV_GRL_H */
diff --git a/src/intel/vulkan/grl/genX_grl_dispatch.c b/src/intel/vulkan/grl/genX_grl_dispatch.c
new file mode 100644
index 00000000000..aeb76b79bd0
--- /dev/null
+++ b/src/intel/vulkan/grl/genX_grl_dispatch.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "genX_grl.h"
+
+static struct anv_shader_bin *
+get_shader_bin(struct anv_device *device,
+ enum grl_cl_kernel kernel)
+{
+ const char *key = genX(grl_get_cl_kernel_sha1)(kernel);
+ int key_len = strlen(key);
+
+ bool cache_hit = false;
+ struct anv_shader_bin *bin =
+ anv_device_search_for_kernel(device, device->internal_cache,
+ key, key_len, &cache_hit);
+ if (bin != NULL)
+ return bin;
+
+ uint32_t dummy_param[32];
+ struct brw_kernel kernel_data;
+ genX(grl_get_cl_kernel)(&kernel_data, kernel);
+
+ assert(kernel_data.prog_data.base.nr_params <= ARRAY_SIZE(dummy_param));
+ kernel_data.prog_data.base.param = dummy_param;
+
+ struct anv_push_descriptor_info empty_push_desc_info = {};
+ struct anv_pipeline_bind_map bind_map = {
+ .kernel_args_size = kernel_data.args_size,
+ .kernel_arg_count = kernel_data.arg_count,
+ .kernel_args = (struct brw_kernel_arg_desc *)kernel_data.args,
+ };
+
+ struct anv_shader_upload_params upload_params = {
+ .stage = MESA_SHADER_KERNEL,
+ .key_data = key,
+ .key_size = key_len,
+ .kernel_data = kernel_data.code,
+ .kernel_size = kernel_data.prog_data.base.program_size,
+ .prog_data = &kernel_data.prog_data.base,
+ .prog_data_size = sizeof(kernel_data.prog_data),
+ .bind_map = &bind_map,
+ .push_desc_info = &empty_push_desc_info,
+ };
+
+ bin = anv_device_upload_kernel(device, device->internal_cache,
+ &upload_params);
+
+ /* The cache already has a reference and it's not going anywhere so there
+ * is no need to hold a second reference.
+ */
+ anv_shader_bin_unref(device, bin);
+
+ return bin;
+}
+
+void
+genX(grl_dispatch)(struct anv_cmd_buffer *cmd_buffer,
+ enum grl_cl_kernel kernel,
+ const uint32_t *global_size,
+ uint32_t arg_count,
+ const struct anv_kernel_arg *args)
+{
+ struct anv_device *device = cmd_buffer->device;
+
+ const struct intel_l3_weights w =
+ intel_get_default_l3_weights(device->info, true, true);
+
+ struct anv_kernel ak = {
+ .bin = get_shader_bin(device, kernel),
+ .l3_config = intel_get_l3_config(device->info, w),
+ };
+
+ genX(cmd_buffer_dispatch_kernel)(cmd_buffer, &ak, global_size,
+ arg_count, args);
+}
+
+uint32_t
+genX(grl_max_scratch_size)(void)
+{
+ uint32_t scratch_size = 0;
+
+ for (uint32_t i = 0; i < GRL_CL_KERNEL_MAX; i++) {
+ struct brw_kernel kernel_data;
+ genX(grl_get_cl_kernel)(&kernel_data, i);
+
+ scratch_size = MAX2(kernel_data.prog_data.base.total_scratch,
+ scratch_size);
+ }
+
+ return scratch_size;
+}
diff --git a/src/intel/vulkan/grl/genX_grl_uuid.cpp b/src/intel/vulkan/grl/genX_grl_uuid.cpp
new file mode 100644
index 00000000000..cf6b425fe2b
--- /dev/null
+++ b/src/intel/vulkan/grl/genX_grl_uuid.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "genX_grl.h"
+#include "include/GRLGen12.h"
+
+#include "vulkan/vulkan_core.h"
+
+extern "C" void
+genX(grl_load_rt_uuid)(uint8_t *out_uuid);
+
+extern "C" void
+genX(grl_load_rt_uuid)(uint8_t *out_uuid)
+{
+ assert(sizeof(GRL::RTAS::GEN12::BVH_MAGIC) == VK_UUID_SIZE);
+ memcpy(out_uuid, GRL::RTAS::GEN12::BVH_MAGIC, VK_UUID_SIZE);
+}
diff --git a/src/intel/vulkan/grl/gpu/AABB.h b/src/intel/vulkan/grl/gpu/AABB.h
new file mode 100644
index 00000000000..11d848e3c09
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/AABB.h
@@ -0,0 +1,450 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "shared.h"
+#include "intrinsics.h"
+#ifndef __OPENCL_VERSION__
+#include "stdio.h"
+#endif
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+/* ====== QUAD ENCODING config ====== */
+
+#define QUAD_GEOMID_BITS 27 // dxr limit is 2^24 geos... we have headroom
+#define QUAD_PRIMID_DIFF_BITS (32 - QUAD_GEOMID_BITS)
+#define QUAD_GEOMID_MASK ((1<<QUAD_GEOMID_BITS)-1)
+
+#define QUAD_PRIMID_BITS 29 // dxr limit is 2^29 prims total within one blas
+#define QUAD_PRIMID_MASK ((1<<QUAD_PRIMID_BITS)-1)
+
+#define INSTANCE_ID_BITS 24
+#define INSTANCE_ID_MASK ((1<<INSTANCE_ID_BITS)-1)
+
+// JDB TODO: Make this a separate, dedicated structure. Aliasing a float4 AABB as a primref is needlessly obfuscated.
+
+typedef struct AABB PrimRef;
+
+GRL_INLINE void AABB_init(struct AABB *aabb)
+{
+ aabb->lower = (float4)(INFINITY, INFINITY, INFINITY, 0);
+ aabb->upper = -(float4)(INFINITY, INFINITY, INFINITY, 0);
+}
+
+GRL_INLINE uint PRIMREF_geomID( PrimRef* aabb)
+{
+ const uint v = as_uint(aabb->lower.w);
+ return v & QUAD_GEOMID_MASK;
+}
+
+GRL_INLINE uint PRIMREF_primID0( PrimRef* aabb)
+{
+ return as_uint( aabb->upper.w ) & QUAD_PRIMID_MASK;
+}
+
+GRL_INLINE uint PRIMREF_primID1( PrimRef* aabb)
+{
+ const uint v = as_uint(aabb->lower.w);
+ const uint primID0 = as_uint(aabb->upper.w) & QUAD_PRIMID_MASK;
+ const uint deltaID = v >> QUAD_GEOMID_BITS;
+ const uint primID1 = primID0 + deltaID;
+ return primID1;
+}
+
+GRL_INLINE uint PRIMREF_geomFlags( PrimRef* aabb )
+{
+ const uint v = as_uint( aabb->upper.w );
+ return (v >> QUAD_PRIMID_BITS) ;
+}
+
+GRL_INLINE uint PRIMREF_instanceIndex( PrimRef* aabb )
+{
+ return as_uint(aabb->lower.w) & INSTANCE_ID_MASK;
+}
+
+GRL_INLINE uchar PRIMREF_instanceMask( PrimRef* aabb )
+{
+ return as_uint(aabb->lower.w) >> INSTANCE_ID_BITS;
+}
+
+GRL_INLINE void PRIMREF_setProceduralMetaData( PrimRef* primref, uint geomID, uint primID, uint geomFlags )
+{
+ /* encode geomID, primID */
+ uint flags = (geomFlags << QUAD_PRIMID_BITS);
+ primref->lower.w = as_float( geomID );
+ primref->upper.w = as_float( primID | flags );
+}
+
+GRL_INLINE void PRIMREF_setQuadMetaData( PrimRef* primref, uint primID0, uint primID1, uint geomID, uint geomFlags )
+{
+ const uint primID_diff = primID1 - primID0;
+ uint flags = geomFlags << QUAD_PRIMID_BITS;
+
+ primref->lower.w = as_float( geomID | (primID_diff << QUAD_GEOMID_BITS) );
+ primref->upper.w = as_float( (primID0 | flags) );
+}
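+
+/* Worked example of the packing above (arbitrary illustrative values): for
+ * geomID = 5, primID0 = 10, primID1 = 11 and geomFlags = 1, primID_diff = 1,
+ * so lower.w = 5 | (1 << QUAD_GEOMID_BITS) = 0x08000005 and
+ * upper.w = 10 | (1 << QUAD_PRIMID_BITS) = 0x2000000a. PRIMREF_geomID(),
+ * PRIMREF_primID0(), PRIMREF_primID1() and PRIMREF_geomFlags() then recover
+ * 5, 10, 11 and 1 respectively.
+ */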
+
+GRL_INLINE void PRIMREF_setAABB( PrimRef* primref, float3 lower, float3 upper )
+{
+ primref->lower.xyz = lower.xyz;
+ primref->upper.xyz = upper.xyz;
+}
+
+GRL_INLINE PrimRef PRIMREF_set_instance( float3 lower, float3 upper, uint instanceIndex, uint instanceMask, uint rootOffset, bool is_procedural )
+{
+ PrimRef new_ref;
+ new_ref.lower.xyz = lower;
+ new_ref.lower.w = as_float(instanceIndex | (instanceMask << 24));
+ new_ref.upper.xyz = upper;
+ new_ref.upper.w = as_float(rootOffset + (is_procedural? 0x80000000 : 0));
+ return new_ref;
+}
+
+GRL_INLINE bool PRIMREF_isProceduralInstance( PrimRef* primref )
+{
+ return (as_uint(primref->upper.w) & 0x80000000) != 0;
+}
+
+GRL_INLINE uint PRIMREF_instanceRootNodeOffset(PrimRef* primref)
+{
+ return (as_uint(primref->upper.w) & 0x7fffffff);
+}
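+
+/* Worked example of the instance encoding (arbitrary illustrative values):
+ * PRIMREF_set_instance(lower, upper, 7, 0xff, 0x40, true) stores
+ * lower.w = 7 | (0xff << 24) = 0xff000007 and
+ * upper.w = 0x40 + 0x80000000 = 0x80000040, so PRIMREF_instanceIndex() = 7,
+ * PRIMREF_instanceMask() = 0xff, PRIMREF_instanceRootNodeOffset() = 0x40 and
+ * PRIMREF_isProceduralInstance() returns true.
+ */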
+
+GRL_INLINE float3 PRIMREF_lower( PrimRef* primref )
+{
+ return primref->lower.xyz;
+}
+GRL_INLINE float3 PRIMREF_upper( PrimRef* primref )
+{
+ return primref->upper.xyz;
+}
+
+GRL_INLINE void AABB_extend(struct AABB *aabb, struct AABB *v)
+{
+ aabb->lower = min(aabb->lower, v->lower);
+ aabb->upper = max(aabb->upper, v->upper);
+}
+
+GRL_INLINE void AABB_extend_point(struct AABB *aabb, const float4 p)
+{
+ aabb->lower = min(aabb->lower, p);
+ aabb->upper = max(aabb->upper, p);
+}
+
+GRL_INLINE void AABB_extendlu(struct AABB *aabb, const float4 lower, const float4 upper)
+{
+ aabb->lower = min(aabb->lower, lower);
+ aabb->upper = max(aabb->upper, upper);
+}
+
+GRL_INLINE struct AABB AABB_enlarge(struct AABB *aabb, const float v)
+{
+ struct AABB box;
+ box.lower = aabb->lower - (float4)v;
+ box.upper = aabb->upper + (float4)v;
+ return box;
+}
+
+GRL_INLINE void AABB_intersect(struct AABB *aabb, struct AABB *v)
+{
+ aabb->lower = max(aabb->lower, v->lower);
+ aabb->upper = min(aabb->upper, v->upper);
+}
+
+GRL_INLINE float4 AABB_size(struct AABB *aabb)
+{
+ return aabb->upper - aabb->lower;
+}
+
+GRL_INLINE float4 AABB_centroid2(struct AABB *aabb)
+{
+ return aabb->lower + aabb->upper;
+}
+
+GRL_INLINE float AABB_halfArea(struct AABB *aabb)
+{
+ const float4 d = AABB_size(aabb);
+ return halfarea(d.xyz);
+}
+
+GRL_INLINE float AABB_intersecion_size(struct AABB* aabb, struct AABB* v)
+{
+ struct AABB temp = *aabb;
+ AABB_intersect(&temp, v);
+ float4 len = AABB_size(&temp);
+ float ret = 0.0f;
+ if (len.x >= 0.0f && len.y >= 0.0f && len.z >= 0.0f) {
+ float3 len3 = { len.x, len.y, len.z };
+ ret = halfarea(len3);
+ }
+ return ret;
+}
+
+GRL_INLINE bool AABB_subset(struct AABB* small, struct AABB* big)
+{
+ const int4 b0 = small->lower >= big->lower;
+ const int4 b1 = small->upper <= big->upper;
+ const int4 b = b0 & b1;
+ return b.x & b.y & b.z;
+}
+
+GRL_INLINE struct AABB AABBfromAABB3f(const struct AABB3f box)
+{
+ struct AABB box4d = {
+ {box.lower[0], box.lower[1], box.lower[2], 0.0f},
+ {box.upper[0], box.upper[1], box.upper[2], 0.0f}
+ };
+ return box4d;
+}
+
+GRL_INLINE struct AABB3f AABB3fFromAABB(const struct AABB box)
+{
+ struct AABB3f box3d = {
+ {box.lower[0], box.lower[1], box.lower[2]},
+ {box.upper[0], box.upper[1], box.upper[2]}
+ };
+ return box3d;
+}
+
+GRL_INLINE bool AABB_verify(struct AABB* aabb)
+{
+ bool error = false;
+ if (aabb->lower.x > aabb->upper.x)
+ error = true;
+ if (aabb->lower.y > aabb->upper.y)
+ error = true;
+ if (aabb->lower.z > aabb->upper.z)
+ error = true;
+ if (!isfinite(aabb->lower.x))
+ error = true;
+ if (!isfinite(aabb->lower.y))
+ error = true;
+ if (!isfinite(aabb->lower.z))
+ error = true;
+ if (!isfinite(aabb->upper.x))
+ error = true;
+ if (!isfinite(aabb->upper.y))
+ error = true;
+ if (!isfinite(aabb->upper.z))
+ error = true;
+ return error;
+}
+
+GRL_INLINE void AABB_print(struct AABB* aabb)
+{
+ printf("AABB {\n area = %f\n lower = %f\n upper = %f\n geomID = %i primID0 = %i primID1 = %i\n aabb->lower.w = %x aabb->upper.w = %x }\n",
+ AABB_halfArea(aabb),
+ aabb->lower.xyz,
+ aabb->upper.xyz,
+ PRIMREF_geomID(aabb),
+ PRIMREF_primID0(aabb),
+ PRIMREF_primID1(aabb),
+ as_uint(aabb->lower.w),
+ as_uint(aabb->upper.w));
+}
+
+#ifdef __OPENCL_VERSION__
+
+GRL_INLINE PrimRef PrimRef_sub_group_shuffle(PrimRef* primRef, const uint slotID)
+{
+ PrimRef shuffledPrimref;
+ shuffledPrimref.lower.x = intel_sub_group_shuffle(primRef->lower.x, slotID);
+ shuffledPrimref.lower.y = intel_sub_group_shuffle(primRef->lower.y, slotID);
+ shuffledPrimref.lower.z = intel_sub_group_shuffle(primRef->lower.z, slotID);
+ shuffledPrimref.lower.w = intel_sub_group_shuffle(primRef->lower.w, slotID);
+ shuffledPrimref.upper.x = intel_sub_group_shuffle(primRef->upper.x, slotID);
+ shuffledPrimref.upper.y = intel_sub_group_shuffle(primRef->upper.y, slotID);
+ shuffledPrimref.upper.z = intel_sub_group_shuffle(primRef->upper.z, slotID);
+ shuffledPrimref.upper.w = intel_sub_group_shuffle(primRef->upper.w, slotID);
+ return shuffledPrimref;
+}
+
+GRL_INLINE struct AABB AABB_sub_group_broadcast(struct AABB *aabb, const uint slotID)
+{
+ struct AABB bounds;
+ bounds.lower.x = sub_group_broadcast(aabb->lower.x, slotID);
+ bounds.lower.y = sub_group_broadcast(aabb->lower.y, slotID);
+ bounds.lower.z = sub_group_broadcast(aabb->lower.z, slotID);
+ bounds.lower.w = 0;
+ bounds.upper.x = sub_group_broadcast(aabb->upper.x, slotID);
+ bounds.upper.y = sub_group_broadcast(aabb->upper.y, slotID);
+ bounds.upper.z = sub_group_broadcast(aabb->upper.z, slotID);
+ bounds.upper.w = 0;
+ return bounds;
+}
+GRL_INLINE struct AABB AABB_sub_group_shuffle(struct AABB* aabb, const uint slotID)
+{
+ struct AABB bounds;
+ bounds.lower.x = intel_sub_group_shuffle(aabb->lower.x, slotID);
+ bounds.lower.y = intel_sub_group_shuffle(aabb->lower.y, slotID);
+ bounds.lower.z = intel_sub_group_shuffle(aabb->lower.z, slotID);
+ bounds.lower.w = 0;
+ bounds.upper.x = intel_sub_group_shuffle(aabb->upper.x, slotID);
+ bounds.upper.y = intel_sub_group_shuffle(aabb->upper.y, slotID);
+ bounds.upper.z = intel_sub_group_shuffle(aabb->upper.z, slotID);
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+GRL_INLINE uint AABB_sub_group_shuffle_coordPerLane(struct AABB* aabb, const uint slotID)
+{
+ float coordData[8] = {
+ sub_group_broadcast(aabb->lower.x, slotID),
+ sub_group_broadcast(aabb->lower.y, slotID),
+ sub_group_broadcast(aabb->lower.z, slotID),
+ sub_group_broadcast(aabb->lower.w, slotID),
+ sub_group_broadcast(aabb->upper.x, slotID),
+ sub_group_broadcast(aabb->upper.y, slotID),
+ sub_group_broadcast(aabb->upper.z, slotID),
+ sub_group_broadcast(aabb->upper.w, slotID) };
+
+ uint coordDataFiltered;
+ const uint lane = get_sub_group_local_id();
+ if (lane < 8) coordDataFiltered = as_uint(coordData[lane]);
+ return coordDataFiltered;
+}
+
+GRL_INLINE struct AABB AABB_sub_group_reduce(struct AABB *aabb)
+{
+ struct AABB bounds;
+ bounds.lower.x = sub_group_reduce_min(aabb->lower.x);
+ bounds.lower.y = sub_group_reduce_min(aabb->lower.y);
+ bounds.lower.z = sub_group_reduce_min(aabb->lower.z);
+ bounds.lower.w = 0;
+ bounds.upper.x = sub_group_reduce_max(aabb->upper.x);
+ bounds.upper.y = sub_group_reduce_max(aabb->upper.y);
+ bounds.upper.z = sub_group_reduce_max(aabb->upper.z);
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+
+GRL_INLINE struct AABB AABB_sub_group_reduce_N6( struct AABB* aabb )
+{
+ float3 l = aabb->lower.xyz;
+ float3 u = aabb->upper.xyz;
+ l = min( l, intel_sub_group_shuffle_down( l, l, 4 ) );
+ l = min( l, intel_sub_group_shuffle_down( l, l, 2 ) );
+ l = min( l, intel_sub_group_shuffle_down( l, l, 1 ) );
+ u = max( u, intel_sub_group_shuffle_down( u, u, 4 ) );
+ u = max( u, intel_sub_group_shuffle_down( u, u, 2 ) );
+ u = max( u, intel_sub_group_shuffle_down( u, u, 1 ) );
+
+ struct AABB bounds;
+ bounds.lower.x = l.x;
+ bounds.lower.y = l.y;
+ bounds.lower.z = l.z;
+ bounds.lower.w = 0;
+ bounds.upper.x = u.x;
+ bounds.upper.y = u.y;
+ bounds.upper.z = u.z;
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+
+GRL_INLINE struct AABB AABB_work_group_reduce(struct AABB *aabb)
+{
+ struct AABB bounds;
+ bounds.lower.x = work_group_reduce_min(aabb->lower.x);
+ bounds.lower.y = work_group_reduce_min(aabb->lower.y);
+ bounds.lower.z = work_group_reduce_min(aabb->lower.z);
+ bounds.upper.x = work_group_reduce_max(aabb->upper.x);
+ bounds.upper.y = work_group_reduce_max(aabb->upper.y);
+ bounds.upper.z = work_group_reduce_max(aabb->upper.z);
+ return bounds;
+}
+
+GRL_INLINE struct AABB AABB_sub_group_scan_exclusive_min_max(struct AABB *aabb)
+{
+ struct AABB bounds;
+ bounds.lower.x = sub_group_scan_exclusive_min(aabb->lower.x);
+ bounds.lower.y = sub_group_scan_exclusive_min(aabb->lower.y);
+ bounds.lower.z = sub_group_scan_exclusive_min(aabb->lower.z);
+ bounds.lower.w = 0;
+ bounds.upper.x = sub_group_scan_exclusive_max(aabb->upper.x);
+ bounds.upper.y = sub_group_scan_exclusive_max(aabb->upper.y);
+ bounds.upper.z = sub_group_scan_exclusive_max(aabb->upper.z);
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+GRL_INLINE struct AABB AABB_sub_group_scan_inclusive_min_max(struct AABB *aabb)
+{
+ struct AABB bounds;
+ bounds.lower.x = sub_group_scan_inclusive_min(aabb->lower.x);
+ bounds.lower.y = sub_group_scan_inclusive_min(aabb->lower.y);
+ bounds.lower.z = sub_group_scan_inclusive_min(aabb->lower.z);
+ bounds.lower.w = 0;
+ bounds.upper.x = sub_group_scan_inclusive_max(aabb->upper.x);
+ bounds.upper.y = sub_group_scan_inclusive_max(aabb->upper.y);
+ bounds.upper.z = sub_group_scan_inclusive_max(aabb->upper.z);
+ bounds.upper.w = 0;
+ return bounds;
+}
+
+GRL_INLINE void AABB_global_atomic_merge(global struct AABB *global_aabb, struct AABB *aabb)
+{
+ atomic_min((volatile __global float *)&global_aabb->lower + 0, aabb->lower.x);
+ atomic_min((volatile __global float *)&global_aabb->lower + 1, aabb->lower.y);
+ atomic_min((volatile __global float *)&global_aabb->lower + 2, aabb->lower.z);
+ atomic_max((volatile __global float *)&global_aabb->upper + 0, aabb->upper.x);
+ atomic_max((volatile __global float *)&global_aabb->upper + 1, aabb->upper.y);
+ atomic_max((volatile __global float *)&global_aabb->upper + 2, aabb->upper.z);
+}
+
+GRL_INLINE void AABB_global_atomic_merge_lu(global struct AABB* global_aabb, float3 lower, float3 upper )
+{
+ atomic_min((volatile __global float*) & global_aabb->lower + 0, lower.x);
+ atomic_min((volatile __global float*) & global_aabb->lower + 1, lower.y);
+ atomic_min((volatile __global float*) & global_aabb->lower + 2, lower.z);
+ atomic_max((volatile __global float*) & global_aabb->upper + 0, upper.x);
+ atomic_max((volatile __global float*) & global_aabb->upper + 1, upper.y);
+ atomic_max((volatile __global float*) & global_aabb->upper + 2, upper.z);
+}
+
+GRL_INLINE void AABB_global_atomic_merge_sub_group_lu(uniform global struct AABB* aabb, float3 lower, float3 upper)
+{
+ uint lane = get_sub_group_local_id();
+ float l[3];
+ l[0] = sub_group_reduce_min(lower.x);
+ l[1] = sub_group_reduce_min(lower.y);
+ l[2] = sub_group_reduce_min(lower.z);
+ float u[3];
+ u[0] = sub_group_reduce_max(upper.x);
+ u[1] = sub_group_reduce_max(upper.y);
+ u[2] = sub_group_reduce_max(upper.z);
+
+ if (lane < 3)
+ {
+ atomic_min((global float*)&aabb->lower + lane, l[lane]);
+ atomic_max((global float*)&aabb->upper + lane, u[lane]);
+ }
+}
+
+
+GRL_INLINE void AABB_local_atomic_merge(local struct AABB *aabb, const float4 lower, const float4 upper)
+{
+ if (lower.x < aabb->lower.x)
+ atomic_min((local float *)&aabb->lower + 0, lower.x);
+ if (lower.y < aabb->lower.y)
+ atomic_min((local float *)&aabb->lower + 1, lower.y);
+ if (lower.z < aabb->lower.z)
+ atomic_min((local float *)&aabb->lower + 2, lower.z);
+ if (upper.x > aabb->upper.x)
+ atomic_max((local float *)&aabb->upper + 0, upper.x);
+ if (upper.y > aabb->upper.y)
+ atomic_max((local float *)&aabb->upper + 1, upper.y);
+ if (upper.z > aabb->upper.z)
+ atomic_max((local float *)&aabb->upper + 2, upper.z);
+}
+#endif
+
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
\ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/api_interface.h b/src/intel/vulkan/grl/gpu/api_interface.h
new file mode 100644
index 00000000000..71a1fff6327
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/api_interface.h
@@ -0,0 +1,840 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+#include "GRLStructs.h"
+#include "shared.h"
+#include "libs/lsc_intrinsics.h"
+
+typedef struct Geo GRL_RAYTRACING_GEOMETRY_DESC;
+
+typedef struct GRL_RAYTRACING_AABB
+{
+ float MinX;
+ float MinY;
+ float MinZ;
+ float MaxX;
+ float MaxY;
+ float MaxZ;
+} GRL_RAYTRACING_AABB;
+
+GRL_INLINE void GLR_set_raytracing_aabb(GRL_RAYTRACING_AABB* dest, struct AABB* source)
+{
+ dest->MinX = source->lower.x;
+ dest->MinY = source->lower.y;
+ dest->MinZ = source->lower.z;
+ dest->MaxX = source->upper.x;
+ dest->MaxY = source->upper.y;
+ dest->MaxZ = source->upper.z;
+}
+
+GRL_INLINE uint3 GRL_load_triangle(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint triID)
+{
+ global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer;
+ uint index_format = geomDesc->Desc.Triangles.IndexFormat;
+
+ if (index_format == INDEX_FORMAT_R32_UINT)
+ {
+ const uint* data = (const uint*)(indices + triID * 3 * 4);
+ return (uint3)(data[0], data[1], data[2]);
+ }
+ else if (index_format == INDEX_FORMAT_NONE)
+ {
+ return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2);
+ }
+ else
+ {
+ const ushort* data = (const ushort*)(indices + triID * 3 * 2);
+ return (uint3)(data[0], data[1], data[2]);
+ }
+}
+
+GRL_INLINE uint3 GRL_load_indices_from_buffer(global char* indices, const uint index_format, const uint triID)
+{
+ if (index_format == INDEX_FORMAT_R32_UINT)
+ {
+ return load_uint3_L1C_L3C((global uint3*)(indices + triID * 3 * 4), 0);
+ }
+ else if (index_format == INDEX_FORMAT_NONE)
+ {
+ return (uint3)(triID * 3, triID * 3 + 1, triID * 3 + 2);
+ }
+ else
+ {
+ const ushort* data = (const ushort*)(indices + triID * 3 * 2);
+ return (uint3)(data[0], data[1], data[2]);
+ }
+}
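+
+/* Example offsets (arbitrary triID): for triID = 4, R32 indices are read
+ * from byte offset 4 * 3 * 4 = 48, R16 indices from byte offset
+ * 4 * 3 * 2 = 24, and INDEX_FORMAT_NONE simply yields the vertex IDs
+ * (12, 13, 14).
+ */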
+
+// Load all 3 indices from one triangle, and a single index from another
+GRL_INLINE uint4 GRL_load_quad_indices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint triID, uint triID_1, ushort fourth_vert)
+{
+ global char* indices = (global char*)geomDesc->Desc.Triangles.pIndexBuffer;
+ uint index_format = geomDesc->Desc.Triangles.IndexFormat;
+
+ if (index_format == INDEX_FORMAT_R32_UINT)
+ {
+ const uint* data0 = (const uint*)(indices + triID * 3 * 4);
+ const uint* data1 = (const uint*)(indices + triID_1 * 3 * 4);
+ return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]);
+ }
+ else if (index_format == INDEX_FORMAT_NONE)
+ {
+ return (uint4)(triID * 3, triID * 3 + 1, triID * 3 + 2, triID_1 * 3 + fourth_vert);
+ }
+ else
+ {
+ const ushort* data0 = (const ushort*)(indices + triID * 3 * 2);
+ const ushort* data1 = (const ushort*)(indices + triID_1 * 3 * 2);
+ return (uint4)(data0[0], data0[1], data0[2], data1[fourth_vert]);
+ }
+}
+
+GRL_INLINE void GRL_set_Type(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, GeometryType type)
+{
+ geomDesc->Type = type;
+}
+
+GRL_INLINE GeometryType GRL_get_Type(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Type;
+}
+
+GRL_INLINE void GRL_set_Flags(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, uint8_t flags)
+{
+ geomDesc->Flags = flags;
+}
+
+GRL_INLINE uint8_t GRL_get_Flags(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Flags;
+}
+
+GRL_INLINE void GRL_set_triangles_Transform(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t transform)
+{
+ geomDesc->Desc.Triangles.pTransformBuffer = transform;
+}
+
+GRL_INLINE gpuva_t GRL_get_triangles_Transform(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.pTransformBuffer;
+}
+
+GRL_INLINE void GRL_set_triangles_IndexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, IndexFormat format)
+{
+ geomDesc->Desc.Triangles.IndexFormat = format;
+}
+
+GRL_INLINE IndexFormat GRL_get_triangles_IndexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.IndexFormat;
+}
+
+GRL_INLINE void GRL_set_triangles_VertexFormat(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, VertexFormat format)
+{
+ geomDesc->Desc.Triangles.VertexFormat = format;
+}
+
+GRL_INLINE VertexFormat GRL_get_triangles_VertexFormat(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.VertexFormat;
+}
+
+GRL_INLINE void GRL_set_triangles_IndexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
+{
+ geomDesc->Desc.Triangles.IndexCount = count;
+}
+
+GRL_INLINE dword GRL_get_triangles_IndexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.IndexCount;
+}
+
+GRL_INLINE void GRL_set_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
+{
+ geomDesc->Desc.Triangles.VertexCount = count;
+}
+
+GRL_INLINE dword GRL_get_triangles_VertexCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.VertexCount;
+}
+
+GRL_INLINE void GRL_set_triangles_IndexBuffer(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t buffer)
+{
+ geomDesc->Desc.Triangles.pIndexBuffer = buffer;
+}
+
+GRL_INLINE gpuva_t GRL_get_triangles_IndexBuffer(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.pIndexBuffer;
+}
+
+GRL_INLINE void GRL_set_triangles_VertexBuffer_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address)
+{
+ geomDesc->Desc.Triangles.pVertexBuffer = address;
+}
+
+GRL_INLINE gpuva_t GRL_get_triangles_VertexBuffer_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.pVertexBuffer;
+}
+
+GRL_INLINE void GRL_set_triangles_VertexBuffer_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, unsigned long stride)
+{
+ geomDesc->Desc.Triangles.VertexBufferByteStride = stride;
+}
+
+GRL_INLINE unsigned long GRL_get_triangles_VertexBuffer_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Triangles.VertexBufferByteStride;
+}
+
+GRL_INLINE unsigned long GRL_get_triangles_IndexFormatSizeInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return (unsigned long)(geomDesc->Desc.Triangles.IndexFormat);
+}
+
+GRL_INLINE void GRL_set_procedurals_AABBCount(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, dword count)
+{
+ geomDesc->Desc.Procedural.AABBCount = count;
+}
+
+GRL_INLINE dword GRL_get_procedurals_AABBCount(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Procedural.AABBCount;
+}
+
+GRL_INLINE void GRL_set_procedurals_AABBs_StartAddress(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, gpuva_t address)
+{
+ geomDesc->Desc.Procedural.pAABBs_GPUVA = address;
+}
+
+GRL_INLINE gpuva_t GRL_get_procedurals_AABBs_StartAddress(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Procedural.pAABBs_GPUVA;
+}
+
+GRL_INLINE void GRL_set_procedurals_AABBs_StrideInBytes(GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, qword stride)
+{
+ geomDesc->Desc.Procedural.AABBByteStride = stride;
+}
+
+GRL_INLINE qword GRL_get_procedurals_AABBs_StrideInBytes(const GRL_RAYTRACING_GEOMETRY_DESC* geomDesc)
+{
+ return geomDesc->Desc.Procedural.AABBByteStride;
+}
+
+GRL_INLINE uint GRL_is_procedural(GRL_RAYTRACING_GEOMETRY_DESC* desc)
+{
+ return desc->Type == (unsigned char)GEOMETRY_TYPE_PROCEDURAL;
+}
+
+GRL_INLINE uint GRL_is_triangle(GRL_RAYTRACING_GEOMETRY_DESC* desc)
+{
+ return desc->Type != (unsigned char)GEOMETRY_TYPE_PROCEDURAL;
+}
+
+GRL_INLINE unsigned int GRL_get_ShaderIndex_Mask(GRL_RAYTRACING_GEOMETRY_DESC* desc)
+{
+ return 0x00FFFFFF;
+}
+
+GRL_INLINE dword GRL_atomic_add_triangles_VertexCount(GRL_RAYTRACING_GEOMETRY_DESC* desc, dword value)
+{
+ return atomic_add((global uint*) & desc->Desc.Triangles.VertexCount, value);
+}
+
+GRL_INLINE unsigned int GRL_get_primitive_count(GRL_RAYTRACING_GEOMETRY_DESC* desc)
+{
+ if (GRL_is_triangle(desc))
+ {
+ if (desc->Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE)
+ {
+ return desc->Desc.Triangles.VertexCount / 3;
+ }
+ else
+ {
+ return desc->Desc.Triangles.IndexCount / 3;
+ }
+ }
+ else
+ {
+ return desc->Desc.Procedural.AABBCount;
+ }
+}
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable // to load half values
+
+GRL_INLINE float snorm_to_float(short v)
+{
+ return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 32767.0f))); // FIXME: do we have intrinsic for this?
+}
+
+GRL_INLINE float snorm8_to_float(signed char v)
+{
+ return min(1.0f, max(-1.0f, ((float)v) * (1.0f / 127.0f))); // FIXME: do we have intrinsic for this?
+}
+
+GRL_INLINE float unorm_to_float(unsigned short v)
+{
+ return min(1.0f, max(0.0f, ((float)v) * (1.0f / 65535.0f))); // FIXME: do we have intrinsic for this?
+}
+
+// only the lower 10 bits of v are used
+GRL_INLINE float unorm10_to_float(unsigned v)
+{
+ const unsigned short mask = (unsigned short)((1u << 10u) - 1u);
+ const unsigned short v10 = (unsigned short)v & mask;
+ return min(1.0f, max(0.0f, ((float)v10) * (1.0f / 1023.0f))); // FIXME: do we have intrinsic for this?
+}
+
+GRL_INLINE float unorm8_to_float(unsigned char v)
+{
+ return min(1.0f, max(0.0f, ((float)v) * (1.0f / 255.0f))); // FIXME: do we have intrinsic for this?
+}
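+
+/* Illustrative values for the conversions above: snorm_to_float(32767) = 1.0,
+ * snorm_to_float(-32768) clamps to -1.0, unorm_to_float(65535) = 1.0,
+ * unorm8_to_float(128) = 128/255 ~= 0.502, unorm10_to_float(1023) = 1.0.
+ */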
+
+GRL_INLINE float4 GRL_load_vertex(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint vtxID)
+{
+ float4 v = (float4)(0, 0, 0, 0);
+ global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer;
+ uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride;
+ uint vertex_format = geomDesc->Desc.Triangles.VertexFormat;
+
+ if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
+ {
+ const float* data = (const float*)(vertices + vtxID * vertex_stride);
+ v = (float4)(data[0], data[1], data[2], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
+ {
+ const float* data = (const float*)(vertices + vtxID * vertex_stride);
+ v = (float4)(data[0], data[1], 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
+ {
+ const half* data = (const half*)(vertices + vtxID * vertex_stride);
+ v = (float4)(data[0], data[1], data[2], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
+ {
+ const half* data = (const half*)(vertices + vtxID * vertex_stride);
+ v = (float4)(data[0], data[1], 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
+ {
+ const short* data = (const short*)(vertices + vtxID * vertex_stride);
+ v = (float4)(snorm_to_float(data[0]),
+ snorm_to_float(data[1]),
+ snorm_to_float(data[2]),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
+ {
+ const short* data = (const short*)(vertices + vtxID * vertex_stride);
+ v = (float4)(snorm_to_float(data[0]),
+ snorm_to_float(data[1]),
+ 0.0f,
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
+ {
+ const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm_to_float(data[0]),
+ unorm_to_float(data[1]),
+ unorm_to_float(data[2]),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
+ {
+ const unsigned short* data = (const unsigned short*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm_to_float(data[0]),
+ unorm_to_float(data[1]),
+ 0.0f,
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
+ {
+ const unsigned data = *(const unsigned*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm10_to_float(data),
+ unorm10_to_float((data >> 10)),
+ unorm10_to_float((data >> 20)),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
+ {
+ const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm8_to_float(data[0]),
+ unorm8_to_float(data[1]),
+ unorm8_to_float(data[2]),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
+ {
+ const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
+ v = (float4)(unorm8_to_float(data[0]),
+ unorm8_to_float(data[1]),
+ 0.0f,
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
+ {
+ const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
+ v = (float4)(snorm8_to_float(data[0]),
+ snorm8_to_float(data[1]),
+ snorm8_to_float(data[2]),
+ 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
+ {
+ const unsigned char* data = (const unsigned char*)(vertices + vtxID * vertex_stride);
+ v = (float4)(snorm8_to_float(data[0]),
+ snorm8_to_float(data[1]),
+ 0.0f,
+ 0.0f);
+ }
+
+ /* perform vertex transformation */
+ if (geomDesc->Desc.Triangles.pTransformBuffer)
+ {
+ global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer;
+ const float x = xfm[0] * v.x + xfm[1] * v.y + xfm[2] * v.z + xfm[3];
+ const float y = xfm[4] * v.x + xfm[5] * v.y + xfm[6] * v.z + xfm[7];
+ const float z = xfm[8] * v.x + xfm[9] * v.y + xfm[10] * v.z + xfm[11];
+ v = (float4)(x, y, z, 0.0f);
+ }
+
+ return v;
+}
+
+GRL_INLINE void GRL_load_triangle_vertices(global char* vertices, const uint vertex_format, const uint vertex_stride, global float* transform_buffer, const uint vtx0ID, const uint vtx1ID, const uint vtx2ID, float4* out)
+{
+ if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
+ {
+ const float3 data0 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx0ID * vertex_stride), 0));
+ const float3 data1 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx1ID * vertex_stride), 0));
+ const float3 data2 = as_float3(load_uint3_L1C_L3C((global uint3*)(vertices + vtx2ID * vertex_stride), 0));
+ out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f);
+ out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f);
+ out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
+ {
+ const float* data0 = (const float*)(vertices + vtx0ID * vertex_stride);
+ const float* data1 = (const float*)(vertices + vtx1ID * vertex_stride);
+ const float* data2 = (const float*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f);
+ out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f);
+ out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
+ {
+ const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride);
+ const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride);
+ const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(data0[0], data0[1], data0[2], 0.0f);
+ out[1] = (float4)(data1[0], data1[1], data1[2], 0.0f);
+ out[2] = (float4)(data2[0], data2[1], data2[2], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
+ {
+ const half* data0 = (const half*)(vertices + vtx0ID * vertex_stride);
+ const half* data1 = (const half*)(vertices + vtx1ID * vertex_stride);
+ const half* data2 = (const half*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(data0[0], data0[1], 0.0f, 0.0f);
+ out[1] = (float4)(data1[0], data1[1], 0.0f, 0.0f);
+ out[2] = (float4)(data2[0], data2[1], 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
+ {
+ const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride);
+ const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride);
+ const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]), 0.0f);
+ out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]), 0.0f);
+ out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
+ {
+ const short* data0 = (const short*)(vertices + vtx0ID * vertex_stride);
+ const short* data1 = (const short*)(vertices + vtx1ID * vertex_stride);
+ const short* data2 = (const short*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f, 0.0f);
+ out[1] = (float4)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f, 0.0f);
+ out[2] = (float4)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
+ {
+ const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride);
+ const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride);
+ const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]), 0.0f);
+ out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]), 0.0f);
+ out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
+ {
+ const unsigned short* data0 = (const unsigned short*)(vertices + vtx0ID * vertex_stride);
+ const unsigned short* data1 = (const unsigned short*)(vertices + vtx1ID * vertex_stride);
+ const unsigned short* data2 = (const unsigned short*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f, 0.0f);
+ out[1] = (float4)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f, 0.0f);
+ out[2] = (float4)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
+ {
+ const unsigned data0 = *(const unsigned*)(vertices + vtx0ID * vertex_stride);
+ const unsigned data1 = *(const unsigned*)(vertices + vtx1ID * vertex_stride);
+ const unsigned data2 = *(const unsigned*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm10_to_float(data0), unorm10_to_float(data0 >> 10), unorm10_to_float(data0 >> 20), 0.0f);
+ out[1] = (float4)(unorm10_to_float(data1), unorm10_to_float(data1 >> 10), unorm10_to_float(data1 >> 20), 0.0f);
+ out[2] = (float4)(unorm10_to_float(data2), unorm10_to_float(data2 >> 10), unorm10_to_float(data2 >> 20), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]), 0.0f);
+ out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]), 0.0f);
+ out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f, 0.0f);
+ out[1] = (float4)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f, 0.0f);
+ out[2] = (float4)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f, 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]), 0.0f);
+ out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]), 0.0f);
+ out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtx0ID * vertex_stride);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtx1ID * vertex_stride);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtx2ID * vertex_stride);
+ out[0] = (float4)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f, 0.0f);
+ out[1] = (float4)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f, 0.0f);
+ out[2] = (float4)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f, 0.0f);
+ }
+
+ /* perform vertex transformation */
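+    /* xfm is a row-major 3x4 affine transform: row i uses xfm[4*i .. 4*i+2] for the
+       linear part and xfm[4*i+3] for the translation */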
+ if (transform_buffer)
+ {
+ global float* xfm = (global float*)transform_buffer;
+ for (uint i = 0; i < 3; ++i)
+ {
+ const float x = xfm[0] * out[i].x + xfm[1] * out[i].y + xfm[2] * out[i].z + xfm[3];
+ const float y = xfm[4] * out[i].x + xfm[5] * out[i].y + xfm[6] * out[i].z + xfm[7];
+ const float z = xfm[8] * out[i].x + xfm[9] * out[i].y + xfm[10] * out[i].z + xfm[11];
+ out[i] = (float4)(x, y, z, 0.0f);
+ }
+ }
+}
+
+GRL_INLINE void GRL_load_quad_vertices_no_stride(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ float3* out0, float3* out1, float3* out2, float3* out3,
+ const uint4 vtxID, const uint vertex_format, global char* vertices)
+{
+ float3 v0, v1, v2, v3;
+
+ if (vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
+ {
+ const float* data0 = (const float*)(vertices + vtxID.x);
+ const float* data1 = (const float*)(vertices + vtxID.y);
+ const float* data2 = (const float*)(vertices + vtxID.z);
+ const float* data3 = (const float*)(vertices + vtxID.w);
+ v0 = (float3)(data0[0], data0[1], data0[2]);
+ v1 = (float3)(data1[0], data1[1], data1[2]);
+ v2 = (float3)(data2[0], data2[1], data2[2]);
+ v3 = (float3)(data3[0], data3[1], data3[2]);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
+ {
+ const float* data0 = (const float*)(vertices + vtxID.x);
+ const float* data1 = (const float*)(vertices + vtxID.y);
+ const float* data2 = (const float*)(vertices + vtxID.z);
+ const float* data3 = (const float*)(vertices + vtxID.w);
+ v0 = (float3)(data0[0], data0[1], 0.0f);
+ v1 = (float3)(data1[0], data1[1], 0.0f);
+ v2 = (float3)(data2[0], data2[1], 0.0f);
+ v3 = (float3)(data3[0], data3[1], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
+ {
+ const half* data0 = (const half*)(vertices + vtxID.x);
+ const half* data1 = (const half*)(vertices + vtxID.y);
+ const half* data2 = (const half*)(vertices + vtxID.z);
+ const half* data3 = (const half*)(vertices + vtxID.w);
+ v0 = (float3)(data0[0], data0[1], data0[2]);
+ v1 = (float3)(data1[0], data1[1], data1[2]);
+ v2 = (float3)(data2[0], data2[1], data2[2]);
+ v3 = (float3)(data3[0], data3[1], data3[2]);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
+ {
+ const half* data0 = (const half*)(vertices + vtxID.x);
+ const half* data1 = (const half*)(vertices + vtxID.y);
+ const half* data2 = (const half*)(vertices + vtxID.z);
+ const half* data3 = (const half*)(vertices + vtxID.w);
+ v0 = (float3)(data0[0], data0[1], 0.0f);
+ v1 = (float3)(data1[0], data1[1], 0.0f);
+ v2 = (float3)(data2[0], data2[1], 0.0f);
+ v3 = (float3)(data3[0], data3[1], 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
+ {
+ const short* data0 = (const short*)(vertices + vtxID.x);
+ const short* data1 = (const short*)(vertices + vtxID.y);
+ const short* data2 = (const short*)(vertices + vtxID.z);
+ const short* data3 = (const short*)(vertices + vtxID.w);
+ v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), snorm_to_float(data0[2]));
+ v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), snorm_to_float(data1[2]));
+ v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), snorm_to_float(data2[2]));
+ v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), snorm_to_float(data3[2]));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_SNORM)
+ {
+ const short* data0 = (const short*)(vertices + vtxID.x);
+ const short* data1 = (const short*)(vertices + vtxID.y);
+ const short* data2 = (const short*)(vertices + vtxID.z);
+ const short* data3 = (const short*)(vertices + vtxID.w);
+ v0 = (float3)(snorm_to_float(data0[0]), snorm_to_float(data0[1]), 0.0f);
+ v1 = (float3)(snorm_to_float(data1[0]), snorm_to_float(data1[1]), 0.0f);
+ v2 = (float3)(snorm_to_float(data2[0]), snorm_to_float(data2[1]), 0.0f);
+ v3 = (float3)(snorm_to_float(data3[0]), snorm_to_float(data3[1]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
+ {
+ const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x);
+ const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y);
+ const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z);
+ const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w);
+ v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), unorm_to_float(data0[2]));
+ v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), unorm_to_float(data1[2]));
+ v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), unorm_to_float(data2[2]));
+ v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), unorm_to_float(data3[2]));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R16G16_UNORM)
+ {
+ const unsigned short* data0 = (const unsigned short*)(vertices + vtxID.x);
+ const unsigned short* data1 = (const unsigned short*)(vertices + vtxID.y);
+ const unsigned short* data2 = (const unsigned short*)(vertices + vtxID.z);
+ const unsigned short* data3 = (const unsigned short*)(vertices + vtxID.w);
+ v0 = (float3)(unorm_to_float(data0[0]), unorm_to_float(data0[1]), 0.0f);
+ v1 = (float3)(unorm_to_float(data1[0]), unorm_to_float(data1[1]), 0.0f);
+ v2 = (float3)(unorm_to_float(data2[0]), unorm_to_float(data2[1]), 0.0f);
+ v3 = (float3)(unorm_to_float(data3[0]), unorm_to_float(data3[1]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
+ {
+ const unsigned data0 = *(const unsigned*)(vertices + vtxID.x);
+ const unsigned data1 = *(const unsigned*)(vertices + vtxID.y);
+ const unsigned data2 = *(const unsigned*)(vertices + vtxID.z);
+ const unsigned data3 = *(const unsigned*)(vertices + vtxID.w);
+ v0 = (float3)(unorm10_to_float(data0), unorm10_to_float((data0 >> 10)), unorm10_to_float((data0 >> 20)));
+ v1 = (float3)(unorm10_to_float(data1), unorm10_to_float((data1 >> 10)), unorm10_to_float((data1 >> 20)));
+ v2 = (float3)(unorm10_to_float(data2), unorm10_to_float((data2 >> 10)), unorm10_to_float((data2 >> 20)));
+ v3 = (float3)(unorm10_to_float(data3), unorm10_to_float((data3 >> 10)), unorm10_to_float((data3 >> 20)));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z);
+ const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w);
+ v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), unorm8_to_float(data0[2]));
+ v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), unorm8_to_float(data1[2]));
+ v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), unorm8_to_float(data2[2]));
+ v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), unorm8_to_float(data3[2]));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_UNORM)
+ {
+ const unsigned char* data0 = (const unsigned char*)(vertices + vtxID.x);
+ const unsigned char* data1 = (const unsigned char*)(vertices + vtxID.y);
+ const unsigned char* data2 = (const unsigned char*)(vertices + vtxID.z);
+ const unsigned char* data3 = (const unsigned char*)(vertices + vtxID.w);
+ v0 = (float3)(unorm8_to_float(data0[0]), unorm8_to_float(data0[1]), 0.0f);
+ v1 = (float3)(unorm8_to_float(data1[0]), unorm8_to_float(data1[1]), 0.0f);
+ v2 = (float3)(unorm8_to_float(data2[0]), unorm8_to_float(data2[1]), 0.0f);
+ v3 = (float3)(unorm8_to_float(data3[0]), unorm8_to_float(data3[1]), 0.0f);
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
+ {
+ const signed char* data0 = (const signed char*)(vertices + vtxID.x);
+ const signed char* data1 = (const signed char*)(vertices + vtxID.y);
+ const signed char* data2 = (const signed char*)(vertices + vtxID.z);
+ const signed char* data3 = (const signed char*)(vertices + vtxID.w);
+ v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), snorm8_to_float(data0[2]));
+ v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), snorm8_to_float(data1[2]));
+ v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), snorm8_to_float(data2[2]));
+ v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), snorm8_to_float(data3[2]));
+ }
+ else if (vertex_format == VERTEX_FORMAT_R8G8_SNORM)
+ {
+ const signed char* data0 = (const signed char*)(vertices + vtxID.x);
+ const signed char* data1 = (const signed char*)(vertices + vtxID.y);
+ const signed char* data2 = (const signed char*)(vertices + vtxID.z);
+ const signed char* data3 = (const signed char*)(vertices + vtxID.w);
+ v0 = (float3)(snorm8_to_float(data0[0]), snorm8_to_float(data0[1]), 0.0f);
+ v1 = (float3)(snorm8_to_float(data1[0]), snorm8_to_float(data1[1]), 0.0f);
+ v2 = (float3)(snorm8_to_float(data2[0]), snorm8_to_float(data2[1]), 0.0f);
+ v3 = (float3)(snorm8_to_float(data3[0]), snorm8_to_float(data3[1]), 0.0f);
+ }
+ else
+ {
+ v0 = (float3)(0.0f, 0.0f, 0.0f);
+ v1 = (float3)(0.0f, 0.0f, 0.0f);
+ v2 = (float3)(0.0f, 0.0f, 0.0f);
+ v3 = (float3)(0.0f, 0.0f, 0.0f);
+ }
+
+
+ /* perform vertex transformation */
+ if (geomDesc->Desc.Triangles.pTransformBuffer)
+ {
+ global float* xfm = (global float*)geomDesc->Desc.Triangles.pTransformBuffer;
+
+ v0.xyz = (float3)(
+ xfm[0] * v0.x + xfm[1] * v0.y + xfm[2] * v0.z + xfm[3],
+ xfm[4] * v0.x + xfm[5] * v0.y + xfm[6] * v0.z + xfm[7],
+ xfm[8] * v0.x + xfm[9] * v0.y + xfm[10] * v0.z + xfm[11]
+ );
+
+ v1.xyz = (float3)(
+ xfm[0] * v1.x + xfm[1] * v1.y + xfm[2] * v1.z + xfm[3],
+ xfm[4] * v1.x + xfm[5] * v1.y + xfm[6] * v1.z + xfm[7],
+ xfm[8] * v1.x + xfm[9] * v1.y + xfm[10] * v1.z + xfm[11]
+ );
+
+ v2.xyz = (float3)(
+ xfm[0] * v2.x + xfm[1] * v2.y + xfm[2] * v2.z + xfm[3],
+ xfm[4] * v2.x + xfm[5] * v2.y + xfm[6] * v2.z + xfm[7],
+ xfm[8] * v2.x + xfm[9] * v2.y + xfm[10] * v2.z + xfm[11]
+ );
+
+ v3.xyz = (float3)(
+ xfm[0] * v3.x + xfm[1] * v3.y + xfm[2] * v3.z + xfm[3],
+ xfm[4] * v3.x + xfm[5] * v3.y + xfm[6] * v3.z + xfm[7],
+ xfm[8] * v3.x + xfm[9] * v3.y + xfm[10] * v3.z + xfm[11]
+ );
+ }
+
+ *out0 = v0;
+ *out1 = v1;
+ *out2 = v2;
+ *out3 = v3;
+}
+
+
+GRL_INLINE void GRL_load_quad_vertices(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ float3* out0, float3* out1, float3* out2, float3* out3,
+ uint4 vtxID)
+{
+ global char* vertices = (global char*)geomDesc->Desc.Triangles.pVertexBuffer;
+ uint vertex_format = geomDesc->Desc.Triangles.VertexFormat;
+ uint vertex_stride = geomDesc->Desc.Triangles.VertexBufferByteStride;
+
+ vtxID *= vertex_stride;
+
+ GRL_load_quad_vertices_no_stride(geomDesc, out0, out1, out2, out3,
+ vtxID, vertex_format, vertices);
+}
+
+
+GRL_INLINE GRL_RAYTRACING_AABB GRL_load_aabb(global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc, const uint primID)
+{
+ global char* aabb0 = (global char*)geomDesc->Desc.Procedural.pAABBs_GPUVA;
+ global char* aabb = aabb0 + (primID * geomDesc->Desc.Procedural.AABBByteStride);
+ return *(global GRL_RAYTRACING_AABB*)aabb;
+}
+
+// same as for d3d12
+typedef struct GRL_RAYTRACING_INSTANCE_DESC
+{
+ float Transform[12];
+ // unsigned int InstanceID : 24;
+ // unsigned int InstanceMask : 8;
+ uint32_t DW0;
+ // unsigned int InstanceContributionToHitGroupIndex : 24;
+ // unsigned int Flags : 8;
+ uint32_t DW1;
+ global char* AccelerationStructure;
+} GRL_RAYTRACING_INSTANCE_DESC;
+
+GRL_INLINE float GRL_get_transform(const GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column)
+{
+ return d->Transform[row * 4 + column];
+}
+
+GRL_INLINE uint32_t GRL_get_instanceID(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return d->DW0 & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t GRL_get_InstanceMask(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return d->DW0 >> 24;
+}
+
+GRL_INLINE uint32_t GRL_get_InstanceContributionToHitGroupIndex(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return d->DW1 & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t GRL_get_InstanceFlags(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return d->DW1 >> 24;
+}
+
+GRL_INLINE gpuva_t GRL_get_AccelerationStructure(const GRL_RAYTRACING_INSTANCE_DESC* d)
+{
+ return (gpuva_t)d->AccelerationStructure;
+}
+
+GRL_INLINE void GRL_set_transform(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t row, const uint32_t column, float value)
+{
+ d->Transform[row * 4 + column] = value;
+}
+
+GRL_INLINE void GRL_set_instanceID(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t id)
+{
+ d->DW0 &= 255 << 24;
+ d->DW0 |= id & ((1 << 24) - 1);
+}
+
+GRL_INLINE void GRL_set_InstanceMask(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t mask)
+{
+ d->DW0 &= ((1 << 24) - 1);
+ d->DW0 |= mask << 24;
+}
+
+GRL_INLINE void GRL_set_InstanceContributionToHitGroupIndex(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t contribution)
+{
+ d->DW1 &= 255 << 24;
+ d->DW1 |= contribution & ((1 << 24) - 1);
+}
+
+GRL_INLINE void GRL_set_InstanceFlags(GRL_RAYTRACING_INSTANCE_DESC* d, const uint32_t flags)
+{
+ d->DW1 &= ((1 << 24) - 1);
+ d->DW1 |= flags << 24;
+}
+
+GRL_INLINE void GRL_set_AccelerationStructure(GRL_RAYTRACING_INSTANCE_DESC* d, gpuva_t address)
+{
+ d->AccelerationStructure = (global char*)address;
+}
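+
+// Example usage (illustrative), for a GRL_RAYTRACING_INSTANCE_DESC d:
+//   GRL_set_instanceID(&d, 0x123456);   // DW0 low 24 bits  = 0x123456
+//   GRL_set_InstanceMask(&d, 0xAB);     // DW0 becomes        0xAB123456
+//   GRL_get_instanceID(&d)   == 0x123456
+//   GRL_get_InstanceMask(&d) == 0xAB
+// DW1 packs InstanceContributionToHitGroupIndex / Flags the same way.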
diff --git a/src/intel/vulkan/grl/gpu/atomic_update.cl b/src/intel/vulkan/grl/gpu/atomic_update.cl
new file mode 100644
index 00000000000..5171a122dc1
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/atomic_update.cl
@@ -0,0 +1,1112 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "GRLGen12.h"
+
+#include "bvh_build_refit.h"
+#include "bvh_build_treelet_refit.h"
+
+
+struct RefitScratch
+{
+ float lower[3];
+ uint mask;
+ float upper[3];
+ uint _pad;
+
+};
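+// Note: 'upper' is stored negated throughout the refit pass so that both bounds can be
+// merged with atomic_min alone; readers (e.g. write_inner_nodes) negate it back.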
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
+init_refit_scratch(
+ global struct BVHBase* bvh,
+ global struct RefitScratch* scratch )
+{
+ uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
+
+ if ( tid < BVHBase_GetNumInternalNodes(bvh) )
+ {
+ float4 v = (float4) (FLT_MAX,FLT_MAX,FLT_MAX,0);
+ store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 0, as_uint4(v) );
+ store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 1, as_uint4(v) );
+ }
+}
+
+bool is_fat_leaf( InternalNode* curNode )
+{
+ return curNode->nodeType != BVH_INTERNAL_NODE; // TODO: Not enough for traversal shaders!! if ts enabled need to check child types
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
+build_fatleaf_table(
+ global struct BVHBase* bvh )
+{
+ uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
+
+ if ( tid < BVHBase_GetNumInternalNodes(bvh) )
+ {
+ InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
+
+ if ( is_fat_leaf(curNode) )
+ {
+ uint offs = atomic_inc_global( &bvh->fatLeafCount );
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint bp = *InnerNode_GetBackPointer(backPointers, tid);
+
+ LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+offs;
+ leaf->backpointer = bp;
+ leaf->inner_node_index = tid;
+ leaf->leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart;
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
+build_fatleaf_table_new_update(
+ global struct Globals *globals,
+ global struct BVHBase* bvh )
+{
+ uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
+
+ if ( tid < BVHBase_GetNumInternalNodes(bvh) )
+ {
+ InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
+
+ if ( is_fat_leaf(curNode) )
+ {
+            // This implementation uses the fatleaf table structure, but it is actually a quad table.
+            // An implementation that processes 2 fatleafs per SIMD lane while iterating over the children
+            // was also tested, but performance was worse.
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint bp = *InnerNode_GetBackPointer(backPointers, tid);
+ uint fatLeafTableStart = bvh->fatLeafTableStart;
+
+ uint leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart;
+ uint numChildren = (bp >> 3) & 0x7;
+
+ uint quad_leaf_table_index = leaf_index;
+
+            // Check whether the children fall outside of the 256-wide work group.
+            // If so, move these cases to the offset after numQuads and push them to the leftovers part,
+            // where fatleaves are stored at every 8th position with additional padding.
+            // This way the leftovers table never has a single fatleaf with children in 2 separate work groups.
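+            // e.g. (illustrative) leaf_index = 253 with numChildren = 6: prev_group = 253,
+            // next_group = (253 + 5) & 255 = 2, so the fatleaf is treated as a leftover and
+            // relocated past numQuads_aligned_256 into the padded region.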
+
+ uint prev_group = leaf_index & 255;
+ uint next_group = (leaf_index + (numChildren - 1)) & 255;
+ uint slm_pos = prev_group;
+ bool is_leftover = prev_group > next_group;
+
+ if(is_leftover)
+ {
+ LeafTableEntry* leafBase = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index));
+ uint numQuads_aligned_256 = (globals->numPrimitives + 255) & ~255;
+
+ uint leftovers_offset = atomic_add_global( &bvh->quadLeftoversCountNewAtomicUpdate, 8 );
+
+ for(uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ uint pos = (i < numChildren) ? i : 0;
+ LeafTableEntry* leaf_null = &leafBase[pos];
+ leaf_null->leaf_index = -1 << 3;
+ }
+
+ quad_leaf_table_index = numQuads_aligned_256 + leftovers_offset;
+ slm_pos = leftovers_offset & 255;
+ }
+
+ LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index));
+
+ for(uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ uint pos = (i < numChildren) ? i : 0;
+ LeafTableEntry* leafCur = &leaf[pos];
+ leafCur->backpointer = bp;
+ leafCur->inner_node_index = (tid << 8) | slm_pos;
+ leafCur->leaf_index = (leaf_index << 3) | pos;
+ }
+
+ // Need to clean the unused area where we pad to 8 for leftovers
+ if(is_leftover)
+ {
+ for(uint i = 1; i < 8; i++)
+ {
+ uint pos = (i >= numChildren) ? i : 7;
+ LeafTableEntry* leafCur = &leaf[pos];
+ leafCur->leaf_index = -1 << 3;
+ }
+ }
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
+build_innernode_table(
+ global struct BVHBase* bvh )
+{
+ uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
+
+ if ( tid < BVHBase_GetNumInternalNodes(bvh) )
+ {
+ InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
+
+ if ( !is_fat_leaf( curNode ) )
+ {
+ uint offs = atomic_inc_global( &bvh->innerCount );
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint bp = *InnerNode_GetBackPointer(backPointers, tid);
+
+ InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh)+offs;
+ inner->node_index_and_numchildren = (tid<<3) | ((bp>>3) &7);
+ inner->first_child = tid + curNode->childOffset;
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1))) void kernel
+fixup_quad_table(
+ global struct BVHBase* bvh )
+{
+    // This kernel runs 2 work groups that set the magic number for unused entries in the
+    // fatleaf table: one work group for the last group of the first part, where quads are packed,
+    // and a second one for the last group of the part where quads are stored padded.
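+    // e.g. (illustrative) group 0 with numQuads = 1000: quadOffsetEnd = 1024, quadOffsetStart = 768,
+    // leftovers = 24, so local ids 232..255 tag fatleaf table entries 1000..1023 with the magic leaf_index.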
+
+ uint numQuads = BVHBase_GetNumQuads(bvh);
+ uint numQuadLeftovers = bvh->quadLeftoversCountNewAtomicUpdate;
+ uint numQuadLeftovers_aligned_256 = (numQuadLeftovers + 255) & ~255;
+
+ uint numQuads_aligned_256 = (numQuads + 255) & ~255;
+ uint quadOffsetEnd = numQuads_aligned_256 + get_group_id(0) * numQuadLeftovers_aligned_256;
+ uint quadOffsetStart = quadOffsetEnd - 256;
+
+ uint quads_number_last_group = (get_group_id(0) == 0) ? numQuads : numQuads_aligned_256 + numQuadLeftovers;
+
+ uint leftovers = quadOffsetEnd - quads_number_last_group;
+
+ uint tid = get_local_id(0) > (255 - leftovers) ? get_local_id(0) : 256 - leftovers;
+
+ if(leftovers != 0)
+ {
+ LeafTableEntry* leafBvh = BVHBase_GetFatLeafTable(bvh);
+
+ LeafTableEntry* leaf = &leafBvh[quadOffsetStart + tid];
+ leaf->leaf_index = -1 << 3;
+ }
+
+ if(get_group_id(0) == 1 && get_local_id(0) == 0)
+ bvh->quadTableSizeNewAtomicUpdate = quadOffsetEnd;
+}
+
+
+// updates one quad leaf and computes the BBOX containing it
+GRL_INLINE void refit_bottom_child_quad_WB(
+ global struct QuadLeaf* quad,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ struct AABB* childAABB)
+{
+ /* get the geomID and primID0/1 for both quad triangles */
+ const uint geomID = PrimLeaf_GetGeoIndex(&quad->leafDesc);
+ const uint primID0 = quad->primIndex0;
+ const uint primID1 = primID0 + QuadLeaf_GetPrimIndexDelta(quad);
+ ushort fourth_vert = 0;
+
+ if (primID1 != primID0)
+ {
+ ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
+ fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
+ fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
+ }
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc + geomID;
+
+ uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);
+
+ // read the indices of the 4 verts we want
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);
+
+ childAABB->lower.xyz = min( min( vtx0, vtx1 ), min(vtx2,vtx3) );
+ childAABB->upper.xyz = max( max( vtx0, vtx1 ), max(vtx2,vtx3) );
+
+ float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x );
+ float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y );
+ float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z );
+
+ global uint4* dst_verts = (global uint4*) &(quad->v[0][0]);
+ store_uint4_L1WB_L3WB( dst_verts, 0, as_uint4(pack0) );
+ store_uint4_L1WB_L3WB( dst_verts, 1, as_uint4(pack1) );
+ store_uint4_L1WB_L3WB( dst_verts, 2, as_uint4(pack2) );
+}
+
+inline uchar4 uchar4_shuffle_down( uchar4 v, uint offs )
+{
+ uint vi = as_uint(v);
+ return as_uchar4(intel_sub_group_shuffle_down(vi,vi,offs));
+}
+inline uchar4 uchar4_broadcast( uchar4 v, uint offs )
+{
+ uint vi = as_uint(v);
+ return as_uchar4(sub_group_broadcast(vi,offs));
+}
+
+GRL_INLINE void sg_InternalNode_setFields(
+ struct InternalNode* node,
+ struct AABB reduced_aabb,
+ const int offset, const uint nodeType, struct AABB* input_aabb,
+ const uint numChildren, const uchar nodeMask )
+{
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ struct AABB conservative_aabb = conservativeAABB(&reduced_aabb);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ int3 exp;
+ const float3 mant = frexp_vec3(len, &exp);
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
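+    // The children are quantized to 8-bit grid coordinates relative to 'org': each axis uses a
+    // power-of-two scale derived from the parent extent (the bitShiftLdexp3 by 8 - exp below),
+    // with lower floored and upper ceiled so the quantized box stays conservative.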
+
+ uchar4 lower_uchar = 0x80;
+ uchar4 upper_uchar = 0;
+
+ ushort lane = get_sub_group_local_id();
+ ushort simd8_id = lane/8;
+ ushort logical_lane = lane%8;
+
+ if( logical_lane < numChildren )
+ {
+ struct AABB child_aabb = conservativeAABB( input_aabb ); // conservative ???
+
+ float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) );
+ lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) );
+ float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) );
+ upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) );
+ lower_uchar.xyz = convert_uchar3_rtn( lower );
+ upper_uchar.xyz = convert_uchar3_rtp( upper );
+ }
+
+ uchar4 lo0 = lower_uchar;
+ uchar4 lo1 = uchar4_shuffle_down( lower_uchar, 1 );
+ uchar4 lo2 = uchar4_shuffle_down( lower_uchar, 2 );
+ uchar4 lo3 = uchar4_shuffle_down( lower_uchar, 3 );
+ uchar4 lo4 = uchar4_shuffle_down( lower_uchar, 4 );
+ uchar4 lo5 = uchar4_shuffle_down( lower_uchar, 5 );
+
+ uchar4 hi0 = upper_uchar;
+ uchar4 hi1 = uchar4_shuffle_down( upper_uchar,1 );
+ uchar4 hi2 = uchar4_shuffle_down( upper_uchar,2 );
+ uchar4 hi3 = uchar4_shuffle_down( upper_uchar,3 );
+ uchar4 hi4 = uchar4_shuffle_down( upper_uchar,4 );
+ uchar4 hi5 = uchar4_shuffle_down( upper_uchar,5 );
+
+ if( logical_lane == 0 )
+ {
+ uchar childBlockStride = 0x01 + (uint)(nodeType == NODE_TYPE_INSTANCE);
+
+ uint4 block0 = (uint4)(as_uint(org.x), as_uint(org.y), as_uint(org.z), offset);
+
+ char3 exp_char = (char3)(exp.x,exp.y,exp.z);
+
+ uint4 block1 = (uint4)(
+ as_uint((uchar4)(nodeType, 0 /* padding */, exp_char.x, exp_char.y)),
+ as_uint((uchar4)(exp_char.z, nodeMask, childBlockStride, childBlockStride)) ,
+ as_uint((uchar4)(childBlockStride, childBlockStride, childBlockStride, childBlockStride)) ,
+ as_uint((uchar4)(lo0.x,lo1.x,lo2.x,lo3.x))
+ );
+
+ uint4 block2 = (uint4)(
+ as_uint((uchar4)(lo4.x,lo5.x,hi0.x,hi1.x)) ,
+ as_uint((uchar4)(hi2.x,hi3.x,hi4.x,hi5.x)) ,
+ as_uint((uchar4)(lo0.y,lo1.y,lo2.y,lo3.y)) ,
+ as_uint((uchar4)(lo4.y,lo5.y,hi0.y,hi1.y))
+ );
+
+ uint4 block3 = (uint4)(
+ as_uint((uchar4)(hi2.y,hi3.y,hi4.y,hi5.y)),
+ as_uint((uchar4)(lo0.z,lo1.z,lo2.z,lo3.z)),
+ as_uint((uchar4)(lo4.z,lo5.z,hi0.z,hi1.z)),
+ as_uint((uchar4)(hi2.z,hi3.z,hi4.z,hi5.z))
+ );
+
+ global uint4* pNode = (global uint4*)node;
+
+#if 0
+ printf(
+ "block0 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n"
+ "block1 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n"
+ "block2 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n"
+ "block3 = %08x,%08x,%08x,%08x %08x,%08x,%08x,%08x \n" ,
+ block0.x,block0.y,block0.z,block0.w,
+ pNode[0].x, pNode[0].y, pNode[0].z, pNode[0].w,
+ block1.x,block1.y,block1.z,block1.w,
+ pNode[1].x, pNode[1].y, pNode[1].z, pNode[1].w,
+ block2.x,block2.y,block2.z,block2.w,
+ pNode[2].x, pNode[2].y, pNode[2].z, pNode[2].w ,
+ block3.x,block3.y,block3.z,block3.w,
+ pNode[3].x, pNode[3].y, pNode[3].z, pNode[3].w );
+#endif
+
+ store_uint4_L1WB_L3WB( pNode, 0, block0 );
+ store_uint4_L1WB_L3WB( pNode, 1, block1 );
+ store_uint4_L1WB_L3WB( pNode, 2, block2 );
+ store_uint4_L1WB_L3WB( pNode, 3, block3 );
+ }
+
+}
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+void kernel
+traverse_aabbs_quad(
+ global struct BVHBase* bvh,
+ global struct RefitScratch* scratch,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc
+ )
+{
+
+ uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh);
+ varying ushort lane = get_sub_group_local_id();
+
+ uniform uint num_leaves = bvh->fatLeafCount;
+
+ local struct RefitScratch local_scratch[256];
+ if( get_local_id(0) < min(num_nodes,256u) )
+ {
+ for( uint i=0; i<3; i++ ){
+ local_scratch[get_local_id(0)].lower[i] = FLT_MAX;
+ local_scratch[get_local_id(0)].upper[i] = FLT_MAX;
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+
+ ushort SIMD8_PER_SG = get_sub_group_size()/8;
+ ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG;
+ ushort simd8_local_id = get_sub_group_local_id()/8;
+ ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
+ ushort logical_lane = lane%8;
+
+ uniform uint fatleaf_index = simd8_id + get_group_id(0)*SIMD8_PER_WG;
+
+
+ if ( fatleaf_index < num_leaves )
+ {
+ LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+fatleaf_index;
+ uint innerNodeIdx = leaf->inner_node_index;
+ uint bp = leaf->backpointer;
+ uint leaf_index = leaf->leaf_index;
+
+ varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx;
+ varying QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index;
+
+ uint childOffs = (((char*)quad) - ((char*)curNode))/64;
+
+ varying struct AABB childrenBox;
+ AABB_init(&childrenBox);
+
+ uint numChildren = (bp >> 3) & 0x7;
+ if (logical_lane < numChildren)
+ {
+ refit_bottom_child_quad_WB(
+ (global struct QuadLeaf*) &quad[logical_lane],
+ geomDesc,
+ &childrenBox );
+ }
+
+ struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ childOffs,
+ NODE_TYPE_QUAD,
+ &childrenBox,
+ numChildren,
+ 0xff );
+
+ // atomic min operation vectorized across 6 lanes
+ // [ lower.xyz ][-][upper.xyz][-]
+ //
+ // Lanes 3 and 7 are inactive. 'upper' is negated
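+        // (illustrative) logical_lane 0..2 carry lower.x/y/z, lanes 4..6 carry -upper.x/y/z;
+        // lanes 3 and 7 are padding and masked out of the atomic propagation below.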
+ bool atomic_mask = (1<<logical_lane) & 0x77;
+
+ uint lmod = logical_lane % 4;
+ uint ldiv = logical_lane / 4;
+ float vlo = reduce_bounds.lower.x;
+ float vhi = reduce_bounds.upper.x;
+ vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
+ vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
+ vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
+ vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
+ float v = (ldiv == 0) ? vlo : -vhi;
+
+
+ global float* pv = (global float*) &scratch[innerNodeIdx];
+
+ store_uint_L1WB_L3WB( (global uint*)(pv+logical_lane), 0, as_uint(v));
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint parent = (bp >> 6);
+
+ // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= 256
+ if(atomic_mask && parent != 0x03FFFFFF)
+ {
+ while( parent >= 256 )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( ((global float*) &(scratch[innerNodeIdx]))+logical_lane, v );
+ parent = bp >> 6;
+ }
+ while( parent != 0x03FFFFFF )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( ((local float*) &(local_scratch[innerNodeIdx]))+logical_lane, v );
+ parent = bp >> 6;
+ }
+ }
+
+ }
+
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ num_nodes = min(num_nodes,256u);
+
+ local float* in = (local float*)&local_scratch[0];
+ global float* out = (global float*)&scratch[0];
+
+ for (uint i = get_local_id(0); i < num_nodes*6; i += 256 )
+ {
+ // since we want to save [ lower.xyz ][-][upper.xyz][-] i.e 0,1,2, 4,5,6 etc. we need to offset +1 for every triplet
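+        // e.g. (illustrative) i = 0,1,2 -> idx = 0,1,2 (node 0 lower), i = 3,4,5 -> idx = 4,5,6 (node 0 upper),
+        // i = 6 -> idx = 8 (node 1 lower.x)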
+ uint idx = i + (i/3);
+
+ float v = in[idx];
+ if( v != FLT_MAX )
+ atomic_min( out + idx , v );
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1)))
+void kernel
+write_inner_nodes(
+ global struct BVHBase* bvh,
+ global struct RefitScratch* scratch
+ )
+{
+ uint SIMD8_PER_SG = get_sub_group_size()/8;
+ uniform uint node_id = SIMD8_PER_SG * get_sub_group_global_id() + (get_sub_group_local_id()/8);
+ varying ushort lane = get_sub_group_local_id() % 8;
+ varying uint num_inners = bvh->innerCount;
+
+ if ( node_id < num_inners )
+ {
+ InnerNodeTableEntry* entry = BVHBase_GetInnerNodeTable(bvh) + node_id;
+ uint node_index = entry->node_index_and_numchildren>>3;
+ uint numChildren = entry->node_index_and_numchildren & 7;
+ uint first_child = entry->first_child;
+
+ varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+node_index;
+
+ varying struct AABB childAABB;
+ AABB_init(&childAABB);
+
+ if( lane < numChildren )
+ {
+ uint child = first_child + lane;
+ childAABB.lower.x = scratch[child].lower[0];
+ childAABB.lower.y = scratch[child].lower[1];
+ childAABB.lower.z = scratch[child].lower[2];
+ childAABB.upper.x = -scratch[child].upper[0];
+ childAABB.upper.y = -scratch[child].upper[1];
+ childAABB.upper.z = -scratch[child].upper[2];
+ }
+
+ varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8);
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ first_child - node_index,
+ NODE_TYPE_INTERNAL,
+ &childAABB,
+ numChildren,
+ 0xff );
+
+ }
+
+ if (node_id == 0 && lane == 0 )
+ {
+ bvh->Meta.bounds.lower[0] = scratch[0].lower[0];
+ bvh->Meta.bounds.lower[1] = scratch[0].lower[1];
+ bvh->Meta.bounds.lower[2] = scratch[0].lower[2];
+ bvh->Meta.bounds.upper[0] = -scratch[0].upper[0];
+ bvh->Meta.bounds.upper[1] = -scratch[0].upper[1];
+ bvh->Meta.bounds.upper[2] = -scratch[0].upper[2];
+ }
+
+}
+
+
+
+#if 1
+#define SLM_BOX_COUNT 1024
+
+struct AABB load_box( uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes )
+{
+ if( place < SLM_BOX_COUNT )
+ return local_boxes[place];
+ else
+ return extra_boxes[place-SLM_BOX_COUNT];
+}
+
+void store_box( struct AABB box, uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes )
+{
+ if (place < SLM_BOX_COUNT)
+ {
+ local_boxes[place] = box;
+ }
+ else
+ {
+ global uint4* ptr = (global uint4*)&extra_boxes[place-SLM_BOX_COUNT];
+ store_uint4_L1WB_L3WB( ptr, 0, as_uint4(box.lower) );
+ store_uint4_L1WB_L3WB( ptr+1, 0, as_uint4(box.upper) );
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(512, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+update_single_group_quads(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct AABB* extra_boxes
+)
+{
+ uniform uint tid = get_sub_group_global_id();
+ uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh);
+ uniform uint num_leaves = bvh->fatLeafCount;
+ uniform uint num_inners = bvh->innerCount;
+
+ varying ushort lane = get_sub_group_local_id();
+
+ local struct AABB local_boxes[SLM_BOX_COUNT]; // == 32KB
+
+ // initialize nodes
+ for (uint i = get_local_id( 0 ); i < num_nodes; i+= get_local_size(0))
+ {
+ struct AABB tmp;
+ AABB_init(&tmp);
+ tmp.upper = -tmp.upper;
+ store_box( tmp, i, local_boxes, extra_boxes );
+ }
+
+
+ if( num_nodes > SLM_BOX_COUNT )
+ mem_fence_workgroup_default();
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+
+ ushort SIMD8_PER_SG = get_sub_group_size()/8;
+ ushort NUM_SIMD8 = get_num_sub_groups()*SIMD8_PER_SG;
+ ushort simd8_local_id = get_sub_group_local_id()/8;
+ ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
+ ushort logical_lane = lane%8;
+
+
+ for ( uint i = simd8_id; i < num_leaves; i+= NUM_SIMD8 )
+ {
+ LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+i;
+ uint innerNodeIdx = leaf->inner_node_index;
+ uint bp = leaf->backpointer;
+ uint leaf_index = leaf->leaf_index;
+
+ varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx;
+ QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index;
+
+ uint childOffs = (((char*)quad) - ((char*)curNode))/64;
+
+ varying struct AABB childrenBox;
+ AABB_init(&childrenBox);
+
+ uint numChildren = (bp >> 3) & 0x7;
+ if (logical_lane < numChildren)
+ {
+
+ refit_bottom_child_quad_WB(
+ (global struct QuadLeaf*) &quad[logical_lane],
+ geomDesc,
+ &childrenBox );
+ }
+
+ struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+
+ if( logical_lane == 0 )
+ {
+ struct AABB negated = reduce_bounds;
+ negated.upper = -negated.upper;
+ store_box( negated, innerNodeIdx, local_boxes, extra_boxes );
+ }
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ childOffs,
+ NODE_TYPE_QUAD,
+ &childrenBox,
+ numChildren,
+ 0xff );
+
+
+ // atomic min operation vectorized across 6 lanes
+ // [ lower.xyz ][-][upper.xyz][-]
+ //
+ // Lanes 3 and 7 are inactive. 'upper' is negated
+ uint lmod = logical_lane % 4;
+ uint ldiv = logical_lane / 4;
+ float vlo = reduce_bounds.lower.x;
+ float vhi = reduce_bounds.upper.x;
+ vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
+ vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
+ vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
+ vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
+ float v = (ldiv == 0) ? vlo : -vhi;
+ bool atomic_mask = (1<<logical_lane) & 0x77;
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint parent = (bp >> 6);
+
+ // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= SLM_BOX_COUNT
+ if(atomic_mask && parent != 0x03FFFFFF)
+ {
+ while( parent >= SLM_BOX_COUNT )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( ((global float*) &(extra_boxes[innerNodeIdx-SLM_BOX_COUNT]))+logical_lane, v );
+ parent = bp >> 6;
+ }
+ while( parent != 0x03FFFFFF )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( ((local float*) &(local_boxes[innerNodeIdx]))+logical_lane, v );
+ parent = bp >> 6;
+ }
+ }
+
+ }
+
+ if( num_nodes > SLM_BOX_COUNT )
+ mem_fence_workgroup_default();
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ for ( uint i = simd8_id; i < num_inners; i+= NUM_SIMD8 )
+ {
+ InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh) + i;
+ uint node_index = inner->node_index_and_numchildren>>3;
+ uint numChildren = inner->node_index_and_numchildren & 7;
+ uint first_child = inner->first_child;
+
+ varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+ node_index;
+
+ //if (curNode->nodeType == BVH_INTERNAL_NODE) // TODO: Needs updating for traversal shaders
+ { // TODO: Consider using an inner node table or UC load to avoid polluting LSC with these reads
+ uint child = first_child + logical_lane;
+
+ bool child_valid = (logical_lane < numChildren);
+
+ struct AABB childAABB;
+ AABB_init(&childAABB);
+ if (child_valid)
+ {
+ childAABB = load_box( child, local_boxes, extra_boxes );
+ childAABB.upper = -childAABB.upper;
+ }
+
+ varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8);
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ first_child - node_index,
+ NODE_TYPE_INTERNAL,
+ &childAABB,
+ numChildren,
+ 0xff );
+ }
+ }
+
+
+ if (get_sub_group_id() == 0 && lane == 0 )
+ {
+ bvh->Meta.bounds.lower[0] = local_boxes[0].lower.x;
+ bvh->Meta.bounds.lower[1] = local_boxes[0].lower.y;
+ bvh->Meta.bounds.lower[2] = local_boxes[0].lower.z;
+ bvh->Meta.bounds.upper[0] = -local_boxes[0].upper.x;
+ bvh->Meta.bounds.upper[1] = -local_boxes[0].upper.y;
+ bvh->Meta.bounds.upper[2] = -local_boxes[0].upper.z;
+ }
+
+}
+#endif
+
+GRL_INLINE void traverse_aabbs_new_update_func(
+ global struct BVHBase* bvh,
+ global char* vertices,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct RefitScratch* scratch,
+ uint vertex_format,
+ local struct AABB3f* children_AABBs,
+ local uint* num_fat_leaves,
+ local struct LeafTableEntry* leafTable_local,
+ const bool single_geo
+ )
+{
+    // The first part of the kernel, with the vertex loads/stores, is executed with one quad per work item,
+    // using the previously prepared QuadDataIndices to get the quad data and vertex indices.
+    // The second part of the kernel, which does the reduction, updates the fatleaf in the bvh and runs
+    // bottom-up, is executed per SIMD.
+    // Bottom-up was also tested with a local part (using local scratch), but since there is not enough SLM,
+    // additional barriers were needed to clean and reuse the SLM, which currently kills performance. Could be
+    // worth revisiting on future gens.
+
+ varying uint lid = get_local_id(0);
+ varying uint tid = lid + get_group_id(0)*get_local_size(0);
+
+ num_fat_leaves[0] = 0;
+ leafTable_local[lid].leaf_index = -1 << 3;
+
+ LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * bvh->fatLeafTableStart + 12 * tid));
+ uint innerNodeIdx_mem = leaf->inner_node_index;
+ uint bp = leaf->backpointer;
+ uint leaf_index_mem = leaf->leaf_index;
+
+ uint numChildren = (bp >> 3) & 0x7;
+
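+    // inner_node_index packs (inner_node_index << 8) | slm_pos; leaf_index packs
+    // (leaf_index << 3) | child_slot (see build_fatleaf_table_new_update).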
+ uint leaf_index = leaf_index_mem >> 3;
+ uint slm_child_offset = leaf_index_mem & 0x7;
+
+ uint innerNodeIdx = innerNodeIdx_mem >> 8;
+ uint slm_pos_main = innerNodeIdx_mem & 0xFF;
+
+ uint first_el_of_group = get_group_id(0)*get_local_size(0);
+ uint quadsNum = BVHBase_GetNumQuads(bvh);
+ uint expected_tid = first_el_of_group < quadsNum ? first_el_of_group : quadsNum - 1;
+
+ // Skip writes when not all children for single fatleaf are present in this work group
+ bool skip_tid = leaf_index == 0x1FFFFFFF;
+ leaf_index = skip_tid ? expected_tid : leaf_index;
+
+ // Compute bounding box for quads
+ varying struct AABB3f childrenBox;
+
+ tid = leaf_index + slm_child_offset;
+
+ // Read vertex indices and quad header from separate buffer
+ uint quadIndicesStart = bvh->quadIndicesDataStart;
+ varying struct QuadDataIndices* vertex_indice_ptr = (QuadDataIndices*)(((char*)bvh) + (64u * quadIndicesStart + 32 * tid));
+ QuadDataIndices vertexMap = vertex_indice_ptr[0];
+
+ varying global uint4* bounds = (global uint4*)((char*)bvh + (64*bvh->quadLeafStart + 64*tid) );
+ uint4 quad_data = (uint4)(vertexMap.header_data[0], vertexMap.header_data[1], vertexMap.header_data[2], vertexMap.header_data[3]);
+ uint4 indices = (uint4)(vertexMap.vert_idx[0], vertexMap.vert_idx[1], vertexMap.vert_idx[2], vertexMap.vert_idx[3]);
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc;
+
+ if(!single_geo)
+ {
+ uint geomID = vertexMap.header_data[0] & 0xFFFFFF;
+ desc += geomID;
+ vertices = (global char*)desc->Desc.Triangles.pVertexBuffer;
+ vertex_format = desc->Desc.Triangles.VertexFormat;
+ }
+
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices_no_stride(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices, vertex_format, vertices);
+
+ for(uint i = 0; i < 3; i++)
+ childrenBox.lower[i] = min( min( vtx0[i], vtx1[i] ), min(vtx2[i],vtx3[i]) );
+
+ for(uint i = 0; i < 3; i++)
+ childrenBox.upper[i] = max( max( vtx0[i], vtx1[i] ), max(vtx2[i],vtx3[i]) );
+
+ float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x );
+ float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y );
+ float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z );
+
+ // Store quad data in bvh
+ // Make sure this goes without partial writes to get best perf
+ store_uint4_L1WB_L3WB( bounds, 0, quad_data );
+ store_uint4_L1WB_L3WB( bounds, 1, as_uint4(pack0) );
+ store_uint4_L1WB_L3WB( bounds, 2, as_uint4(pack1) );
+ store_uint4_L1WB_L3WB( bounds, 3, as_uint4(pack2) );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ struct AABB reduce_bounds;
+
+ if(!skip_tid)
+ {
+ // Store AABB in SLM, to be used later for children quantization in fatleaf
+ children_AABBs[slm_pos_main + slm_child_offset] = childrenBox;
+
+ if(slm_child_offset == 0)
+ {
+ uint offset = atomic_inc_local(&num_fat_leaves[0]);
+ leafTable_local[offset].inner_node_index = innerNodeIdx_mem;
+ leafTable_local[offset].backpointer = bp;
+ leafTable_local[offset].leaf_index = leaf_index_mem;
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ varying ushort lane = get_sub_group_local_id();
+ ushort SIMD8_PER_SG = get_sub_group_size()/8;
+ ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG;
+ ushort simd8_local_id = get_sub_group_local_id()/8;
+ ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
+ ushort logical_lane = lane%8;
+
+ uint fatleaves_aligned_32 = (num_fat_leaves[0] + 31) & ~31;
+
+ for(uint offset = 0; offset < fatleaves_aligned_32; offset += 32)
+ {
+ uniform uint fatleaf_index = simd8_id + offset;
+ uint innerNodeIdx_mem = leafTable_local[fatleaf_index].inner_node_index;
+ uint bp = leafTable_local[fatleaf_index].backpointer;
+ uint leaf_index_mem = leafTable_local[fatleaf_index].leaf_index;
+
+ uint numChildren = (bp >> 3) & 0x7;
+
+ uint leaf_index = leaf_index_mem >> 3;
+ uint slm_child_offset = leaf_index_mem & 0x7;
+
+ uint innerNodeIdx = innerNodeIdx_mem >> 8;
+ uint slm_pos_main = innerNodeIdx_mem & 0xFF;
+
+ bool skip_tid = leaf_index == 0x1FFFFFFF;
+ bool active_lane = (logical_lane < numChildren);
+ uint lane_children = active_lane ? logical_lane : 0;
+
+ fatleaf_index = leaf_index;
+
+ varying InternalNode* curNode = (InternalNode*)(((char*)bvh) + (BVH_ROOT_NODE_OFFSET + 64 * innerNodeIdx));
+
+ global struct Quad *quads = (global struct Quad *)((char*)bvh + 64*bvh->quadLeafStart );
+
+ varying struct AABB childrenBox_bu;
+ AABB_init(&childrenBox_bu);
+
+ if(!skip_tid)
+ childrenBox_bu = AABBfromAABB3f(children_AABBs[slm_pos_main + lane_children]);
+
+ struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox_bu);
+ struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
+
+ for (uint i = 1; i < SIMD8_PER_SG; i++)
+ {
+ struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
+ int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
+ reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
+ reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
+ }
+
+ if(!skip_tid)
+ {
+ uint quad_offset = 64u * bvh->quadLeafStart + 64 * fatleaf_index;
+ varying QuadLeaf* quad = (QuadLeaf*)(((char*)bvh) + quad_offset);
+ uint childOffs = (((char*)quad) - ((char*)curNode))/64;
+
+ sg_InternalNode_setFields(
+ curNode,
+ reduce_bounds,
+ childOffs,
+ NODE_TYPE_QUAD,
+ &childrenBox_bu,
+ numChildren,
+ 0xff );
+
+ bool atomic_mask = (1<<logical_lane) & 0x77;
+
+ uint lmod = logical_lane % 4;
+ uint ldiv = logical_lane / 4;
+ float vlo = reduce_bounds.lower.x;
+ float vhi = reduce_bounds.upper.x;
+ vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
+ vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
+ vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
+ vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
+ float v = (ldiv == 0) ? vlo : -vhi;
+
+ global float* pv = (global float*) &scratch[innerNodeIdx];
+
+ store_uint_L1WB_L3WB( (global uint*)(pv+logical_lane), 0, as_uint(v));
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ uint parent = (bp >> 6);
+
+ global float* parent_v = (global float*) &(scratch[parent]) + logical_lane;
+
+ if(atomic_mask && (*parent_v >= v) && (parent != 0x03FFFFFF))
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ atomic_min( parent_v, v );
+ parent = bp >> 6;
+
+ if(parent != 0x03FFFFFF)
+ {
+ while( parent != 0x03FFFFFF )
+ {
+ innerNodeIdx = parent;
+ bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+
+ global float* parent_v_global = (global float*) &(scratch[innerNodeIdx]) + logical_lane;
+ if(*parent_v_global >= v)
+ atomic_min( parent_v_global, v );
+ else
+ break;
+
+ parent = bp >> 6;
+ }
+ }
+ }
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+void kernel
+traverse_aabbs_new_update(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct RefitScratch* scratch
+ )
+{
+ varying uint lid = get_local_id(0);
+ varying uint tid = lid + get_group_id(0)*get_local_size(0);
+
+ local struct AABB3f children_AABBs[256];
+ local struct LeafTableEntry leafTable_local[256];
+ local uint num_fat_leaves;
+
+ traverse_aabbs_new_update_func(bvh, (global char*)geomDesc /* not used */, geomDesc, scratch, (uint)-1 /* not used */,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], false);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+void kernel
+traverse_aabbs_new_update_single_geo(
+ global struct BVHBase* bvh,
+ global char* vertices,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct RefitScratch* scratch,
+ const uint vertex_format
+ )
+{
+ varying uint lid = get_local_id(0);
+ varying uint tid = lid + get_group_id(0)*get_local_size(0);
+
+ local struct AABB3f children_AABBs[256];
+ local struct LeafTableEntry leafTable_local[256];
+ local uint num_fat_leaves;
+
+ if(vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32B32_FLOAT,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32_FLOAT,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_FLOAT,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_FLOAT,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_SNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16_SNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_SNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R16G16_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R10G10B10A2_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R8G8_UNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_UNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_SNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else if(vertex_format == VERTEX_FORMAT_R8G8_SNORM)
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_SNORM,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+ else
+ traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, (uint)-1,
+ &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
+}
diff --git a/src/intel/vulkan/grl/gpu/atomic_update.grl b/src/intel/vulkan/grl/gpu/atomic_update.grl
new file mode 100644
index 00000000000..9e1d6923d4a
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/atomic_update.grl
@@ -0,0 +1,198 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module atomic_update;
+
+kernel_module atomic_update ("atomic_update.cl")
+{
+ links lsc_intrinsics;
+ kernel init_refit_scratch < kernelFunction = "init_refit_scratch" >;
+ kernel traverse_aabbs_quad < kernelFunction = "traverse_aabbs_quad" >;
+ kernel write_inner_nodes < kernelFunction = "write_inner_nodes" >;
+ kernel build_fatleaf_table < kernelFunction = "build_fatleaf_table" >;
+ kernel build_innernode_table < kernelFunction = "build_innernode_table" >;
+
+ kernel update_single_group_quads < kernelFunction = "update_single_group_quads" >;
+
+ kernel build_fatleaf_table_new_update < kernelFunction = "build_fatleaf_table_new_update" >;
+ kernel fixup_quad_table < kernelFunction = "fixup_quad_table" >;
+ kernel traverse_aabbs_new_update < kernelFunction = "traverse_aabbs_new_update" >;
+ kernel traverse_aabbs_new_update_single_geo < kernelFunction = "traverse_aabbs_new_update_single_geo" >;
+}
+
+import struct MKBuilderState "structs.grl";
+
+// this metakernel only initializes registers for use in a batching loop by "init_refit_scratch"
+metakernel init_refit_scratch_metakernel_registers()
+{
+ REG0.hi = 0;
+ REG1 = 3;
+ REG2 = 63;
+ REG3 = 4;
+ REG4 = 2;
+
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+}
+
+metakernel init_refit_scratch( qword bvh_base, qword scratch)//, dword max_inner_nodes )
+{
+ REG0.lo = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
+ define C_3 REG1;
+ define C_63 REG2;
+ define C_4 REG3;
+ define C_2 REG4;
+
+ REG0 = REG0 - C_3; // nodedataCurr - fixed offset
+ REG0 = REG0 + C_63; // + 63
+ REG0 = REG0 >> C_4; // >> 4
+ REG0 = REG0 >> C_2; // >> 2 == >> 6 == /64
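+    // e.g. (illustrative) nodedataCurr = 259 gives (259 - 3 + 63) >> 6 = 4 dispatched groups of 64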
+
+ DISPATCHDIM_X = REG0.lo;
+
+ dispatch_indirect init_refit_scratch//( (max_inner_nodes+63)/64, 1, 1 )
+ args(bvh_base,scratch);
+
+}
+
+metakernel build_node_tables( qword bvh_base )
+{
+ REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
+ REG1 = 2;
+ REG2 = 63;
+ REG3 = 4;
+ REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
+
+ REG0 = REG0 - REG4; // nodedataCurr - fixed offset
+ REG0 = REG0 + REG2; // + 63
+ REG0 = REG0 >> REG3; // >> 4
+ REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect build_fatleaf_table//( (max_inner_nodes+63)/64, 1, 1 )
+ args(bvh_base);
+ dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
+ args(bvh_base);
+}
+
+metakernel build_node_tables_new_update( MKBuilderState state, qword bvh_base )
+{
+ REG0 = load_dword( bvh_base + 12 ); // TODO: DON'T HARDCODE!!
+ REG1 = 2;
+ REG2 = 63;
+ REG3 = 4;
+ REG4 = 3; // fixed offset... TODO: DON'T HARDCODE!!
+
+ REG0 = REG0 - REG4; // nodedataCurr - fixed offset
+ REG0 = REG0 + REG2; // + 63
+ REG0 = REG0 >> REG3; // >> 4
+ REG0 = REG0 >> REG1; // >> 2 == >> 6 == /64
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect build_fatleaf_table_new_update//( (max_inner_nodes+63)/64, 1, 1 )
+ args(state.build_globals, bvh_base);
+ dispatch_indirect build_innernode_table//( (max_inner_nodes+63)/64, 1, 1 )
+ args(bvh_base);
+}
+
+metakernel fixup_quad_table( qword bvh_base )
+{
+ dispatch fixup_quad_table(2,1,1)
+ args(bvh_base);
+}
+
+// this metakernel only initializes registers for use in a batching loop by "traverse_aabbs_quad" and "write_inner_nodes"
+metakernel init_traverse_aabbs_quad_and_write_inner_nodes()
+{
+ REG0.hi = 0;
+ REG1 = 1;
+ REG2 = 31;
+ REG3 = 4;
+ REG4 = 2;
+ REG5 = 7;
+ REG6 = 255;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+}
+
+metakernel traverse_aabbs_quad( qword bvh_base, qword scratch, qword geos)//, dword max_inner_nodes )
+{
+
+    REG0.lo = load_dword( bvh_base + 64 );  // TODO: Don't hardcode!
+ define C_1 REG1;
+ define C_31 REG2;
+ define C_4 REG3;
+
+ REG0 = REG0 + C_31; // + 31
+ REG0 = REG0 >> C_4; // >> 4
+ REG0 = REG0 >> C_1; // >> 1 == >> 5 == /32
+
+ DISPATCHDIM_X = REG0.lo;
+
+    dispatch_indirect traverse_aabbs_quad//( (max_inner_nodes+31)/32, 1, 1 )
+ args(bvh_base,scratch,geos);
+}
+
+metakernel write_inner_nodes( qword bvh_base, qword scratch )//, dword max_inner_nodes )
+{
+    REG0.lo = load_dword( bvh_base + 68 );  // TODO: Don't hardcode!
+ define C_1 REG1;
+ define C_2 REG4;
+ define C_7 REG5;
+
+ REG0 = REG0 + C_7; // + 7
+ REG0 = REG0 >> C_2; // >> 2
+ REG0 = REG0 >> C_1; // >> 1 ==> >> 3 (/8)
+ DISPATCHDIM_X = REG0.lo;
+
+ dispatch_indirect write_inner_nodes//( (max_inner_nodes+7)/8, 1, 1 )
+ args(bvh_base,scratch);
+}
+
+metakernel update_single_group_quads( qword bvh_base, qword geos, qword aabbs )
+{
+ dispatch update_single_group_quads(1,1,1) //( (max_inner_nodes+1)/2, 1, 1 )
+ args(bvh_base,geos,aabbs);
+}
+
+metakernel traverse_aabbs_new_update( qword bvh_base, qword geos, qword scratch )
+{
+ REG0.lo = load_dword( bvh_base + 84 ); // TODO: Don't hardcode!
+ define C_255 REG6;
+ define C_4 REG3;
+
+ REG0 = REG0 + C_255; // + 255
+ REG0 = REG0 >> C_4; // >> 4
+ REG0 = REG0 >> C_4; // >> 4 == >> 8 == /256
+
+ DISPATCHDIM_X = REG0.lo;
+
+ dispatch_indirect traverse_aabbs_new_update//( (max_inner_nodes+255)/256, 1, 1 )
+ args(bvh_base, geos, scratch);
+}
+
+metakernel traverse_aabbs_new_update_single_geo( qword bvh_base, qword vertices, qword geos, qword scratch, dword vertex_format )
+{
+ REG0.lo = load_dword( bvh_base + 84 ); // TODO: Don't hardcode!
+ define C_255 REG6;
+ define C_4 REG3;
+
+ REG0 = REG0 + C_255; // + 255
+ REG0 = REG0 >> C_4; // >> 4
+ REG0 = REG0 >> C_4; // >> 4 == >> 8 == /256
+
+ DISPATCHDIM_X = REG0.lo;
+
+ dispatch_indirect traverse_aabbs_new_update_single_geo//( (max_inner_nodes+255)/256, 1, 1 )
+ args(bvh_base, vertices, geos, scratch, vertex_format);
+} \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/binned_sah_shared.h b/src/intel/vulkan/grl/gpu/binned_sah_shared.h
new file mode 100644
index 00000000000..8b22f6612cd
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/binned_sah_shared.h
@@ -0,0 +1,265 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file contains structure definitions shared by GRL OCL kernels and host code
+//
+
+#include "GRLGen12.h"
+#pragma once
+
+#define BFS_NUM_BINS 16
+#define BFS_NUM_VCONTEXTS 256
+#define BFS_MAX_DEPTH 32
+
+#define TRIVIAL_BUILD_THRESHOLD 6
+#define SINGLE_WG_BUILD_THRESHOLD 256
+
+#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
+
+
+typedef uchar vcontext_id_t;
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(GPUBVHBuilder)
+
+struct BFS_Split
+{
+ float sah;
+ int dim;
+ int pos;
+};
+
+
+struct BFS_BinInfo
+{
+ float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
+ // The 6 are lower(xyz) and -upper(xyz)
+ // bins use negated-max so that we can use vectorized mins instead of min/max pairs
+ uint counts[3 * BFS_NUM_BINS];
+};
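+
+// Indexing sketch for min_max (matches BinInfo_get_AABB in bvh_build_BFS.cl):
+// each (axis, bin) slot is six consecutive floats, the first three holding
+// lower.xyz and the next three holding -upper.xyz, so a single atomic_min can
+// update either end of the box:
+//
+//   float* slot = &bin_info->min_max[6 * (bin + axis * BFS_NUM_BINS)];
+//   float  lo_x =  slot[0];   // lower.x
+//   float  hi_x = -slot[3];   // upper.x, stored negated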
+
+enum_uint8(SAHBuildFlags)
+{
+ SAH_FLAG_NEED_BACKPOINTERS = 1, // identifies a mixed internal node where each child can have a different type
+ SAH_FLAG_NEED_MASKS = 2
+};
+
+struct SAHBuildGlobals
+{
+ qword p_primref_index_buffers;
+ qword p_primrefs_buffer;
+ qword p_bvh2;
+ qword p_globals; // TODO: deprecate this
+ qword p_bvh_base;
+ gpuva_t p_qnode_root_buffer;
+
+ dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
+ dword num_primrefs;
+ dword leaf_size;
+ dword leaf_type;
+
+ dword root_buffer_num_produced;
+ dword root_buffer_num_produced_hi;
+ dword root_buffer_num_consumed;
+ dword root_buffer_num_consumed_hi;
+ dword root_buffer_num_to_consume;
+ dword root_buffer_num_to_consume_hi;
+};
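+
+// 'flags' carries the SAHBuildFlags bits defined above; a minimal sketch of how
+// the builder kernels test them (see SAHBuildGlobals_NeedBackPointers/NeedMasks
+// in bvh_build_BFS.cl):
+//
+//   bool need_backpointers = (globals->flags & SAH_FLAG_NEED_BACKPOINTERS) != 0;
+//   bool need_masks        = (globals->flags & SAH_FLAG_NEED_MASKS) != 0;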
+
+struct SAHBuildBuffersInfo
+{
+ gpuva_t p_globals;
+ gpuva_t p_primref_index_buffers;
+ gpuva_t p_primrefs_buffer;
+ gpuva_t p_bvh2;
+ gpuva_t p_bvh_base;
+ gpuva_t p_qnode_root_buffer;
+ dword sah_globals_flags;
+ dword _pad;
+ gpuva_t _pad2;
+};
+
+typedef union LRBounds
+{
+ struct
+ {
+ struct AABB3f left_centroid_bounds;
+ struct AABB3f left_geom_bounds;
+ struct AABB3f right_centroid_bounds;
+ struct AABB3f right_geom_bounds;
+ } boxes;
+ struct
+ {
+ float Array[24];
+ } scalars;
+} LRBounds;
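+
+// Note on the layout: the 'upper' fields are stored negated so that every
+// element of 'scalars.Array' can be merged with atomic_min alone.  A sketch of
+// reading a usable box back (mirrors the LRBounds_get_* helpers in
+// bvh_build_BFS.cl):
+//
+//   float3 lower =  AABB3f_load_lower(&b->boxes.left_geom_bounds);
+//   float3 upper = -AABB3f_load_upper(&b->boxes.left_geom_bounds);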
+
+
+struct VContext
+{
+ uint dispatch_primref_begin; // range of primrefs for this task
+ uint dispatch_primref_end;
+ uint bvh2_root; // BVH2 root node for this task
+ uint tree_depth; // depth of this node in the tree
+ uint num_left; // primref counts
+ uint num_right;
+ uint lr_mask; // lower 8b : left mask. upper 8b : right mask
+ uint batch_index;
+
+ // pass1 global working state and output
+ struct BFS_Split split;
+ struct BFS_BinInfo global_bin_info;
+
+ // pass2 global working state and output
+ LRBounds lr_bounds;
+};
+
+
+
+struct BFSDispatchRecord
+{
+ ushort batch_index;
+ ushort context_id;
+};
+
+
+struct BFSDispatchQueue
+{
+ uint num_dispatches;
+ uint wg_count[BFS_NUM_VCONTEXTS];
+ struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
+};
+
+struct BFS1SpillStackEntry
+{
+ uint primref_begin;
+ uint primref_end;
+ uint bvh2_root;
+ ushort tree_depth;
+ ushort batch_index;
+};
+
+struct BFS1SpillStack
+{
+ uint size;
+ struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
+};
+
+struct QNodeGlobalRootBufferEntry
+{
+ uint bvh2_node;
+ uint qnode;
+ uint build_idx;
+ uint _pad;
+};
+
+struct QNodeGlobalRootBuffer
+{
+ uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
+ struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
+};
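+
+// Usage sketch (an assumption based on the comment above, not a definitive API):
+// 'entries' acts as two ping-pong halves, so switching halves is just toggling
+// the offset between 0 and QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM:
+//
+//   buf->curr_entries_offset = (buf->curr_entries_offset == 0)
+//       ? QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM : 0;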
+
+struct DFSDispatchRecord
+{
+ uint primref_base;
+ uint bvh2_base;
+ uint batch_index;
+ ushort num_primrefs;
+ ushort tree_depth;
+};
+
+
+struct DFSDispatchQueue
+{
+ struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
+};
+
+#define VCONTEXT_STATE_EXECUTING 0
+#define VCONTEXT_STATE_UNALLOCATED 1
+
+union SchedulerUnion
+{
+ struct VContextScheduler
+ {
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'new_sah_builder.grl'
+ /////////////////////////////////////////////////////////////
+
+ dword num_bfs_wgs;
+ dword num_dfs_wgs;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
+
+ dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
+ dword batched_build_loop_mask; // value is 0 if #builds <= #contexts, else 1. The command streamer uses this as a loop condition
+
+ /////////////////////////////////////////////////////////////
+
+ dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
+ dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+ dword vcontext_state[BFS_NUM_VCONTEXTS];
+
+ struct BFSDispatchQueue bfs_queue;
+ struct DFSDispatchQueue dfs_queue;
+
+ struct VContext contexts[BFS_NUM_VCONTEXTS];
+
+ struct BFS1SpillStack bfs2_spill_stack;
+ } vContextScheduler;
+
+ struct QnodeScheduler
+ {
+ dword num_qnode_grb_curr_entries;
+ dword num_qnode_grb_new_entries;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
+
+ dword batched_builds_to_process;
+ dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
+
+ /////////////////////////////////////////////////////////////
+
+ dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
+ dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+ struct QNodeGlobalRootBuffer qnode_global_root_buffer;
+ } qnodeScheduler;
+};
+
+
+struct BVH2Node
+{
+ struct AABB3f box;
+ uint meta_u; // leaf: primref start. inner: offset from node to its first child
+ uint meta_ss;
+ //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
+ //uchar is_inner; // 1 if inner, 0 if leaf
+ //uchar mask;
+};
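+
+// Bit layout of meta_ss as written by BVH2_WriteInnerNode / BVH2_WriteLeafNode
+// in bvh_build_BFS.cl (summary; the short fields below are commented out):
+//   bits  0..15 : leaf: primref count.  inner: offset from first to second child
+//   bit  16     : 1 for inner nodes, 0 for leaves
+//   bits 24..31 : traversal mask
+// e.g. an inner node is encoded as:
+//   meta_ss = 0x10000 + (child1_index - child0_index) + (mask << 24);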
+
+struct BVH2
+{
+ uint num_nodes;
+ uint _pad[7]; // align to 32B
+};
+
+
+GRL_NAMESPACE_END(GPUBVHBuilder)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/gpu/build_leaf.grl b/src/intel/vulkan/grl/gpu/build_leaf.grl
new file mode 100644
index 00000000000..7b154d03b43
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/build_leaf.grl
@@ -0,0 +1,206 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module leaf_builder;
+
+kernel_module leaf_kernels ("bvh_build_leaf.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_kernel_primref_to_quads < kernelFunction="primref_to_quads" >;
+ kernel opencl_kernel_primref_to_procedurals < kernelFunction="primref_to_procedurals" >;
+ kernel opencl_kernel_create_HW_instance_nodes < kernelFunction="create_HW_instance_nodes" >;
+ kernel opencl_kernel_create_HW_instance_nodes_pointers < kernelFunction="create_HW_instance_nodes_pointers" >;
+}
+
+import struct MKBuilderState "structs.grl";
+import struct MKSizeEstimate "structs.grl";
+
+const Instances_GROUPSIZE = 16;
+
+metakernel buildLeafDXR_instances(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ qword srcInstanceDescrArray,
+ dword stride,
+ dword offset,
+ dword numPrims)
+{
+ define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE;
+ dispatch opencl_kernel_create_HW_instance_nodes(num_groups,1,1) args(
+ state.build_globals,
+ build_primref_index_buffers,
+ state.build_primref_buffer,
+ state.bvh_buffer,
+ srcInstanceDescrArray,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_instances_indirect(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ qword srcInstanceDescrArray,
+ qword indirectBuildRangeInfo,
+ dword stride,
+ dword offset)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // Instances_GROUPSIZE - 1
+ C_4 = 4; // log_2(Instances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_create_HW_instance_nodes args(
+ state.build_globals,
+ build_primref_index_buffers,
+ state.build_primref_buffer,
+ state.bvh_buffer,
+ srcInstanceDescrArray,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_instances_pointers(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ qword srcInstanceDescrArrayPtr,
+ dword stride,
+ dword offset,
+ dword numPrims)
+{
+ define num_groups (numPrims+Instances_GROUPSIZE-1)/Instances_GROUPSIZE;
+ dispatch opencl_kernel_create_HW_instance_nodes_pointers(num_groups,1,1) args(
+ state.build_globals,
+ build_primref_index_buffers,
+ state.build_primref_buffer,
+ state.bvh_buffer,
+ srcInstanceDescrArrayPtr,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_instances_pointers_indirect(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ qword srcInstanceDescrArrayPtr,
+ qword indirectBuildRangeInfo,
+ dword stride,
+ dword offset)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // Instances_GROUPSIZE - 1
+ C_4 = 4; // log_2(Instances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / Instances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_create_HW_instance_nodes_pointers args(
+ state.build_globals,
+ build_primref_index_buffers,
+ state.build_primref_buffer,
+ state.bvh_buffer,
+ srcInstanceDescrArrayPtr,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_procedurals(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ dword stride,
+ dword offset,
+ qword p_numPrimitives)
+{
+ define C_1 REG0;
+ define REG_PRIMS_PER_WG REG1;
+ define REG_PRIMS_PER_WG_SHR REG2;
+
+ C_1 = 1;
+ REG_PRIMS_PER_WG = 16;
+ REG_PRIMS_PER_WG_SHR = 4;// We cannot use div, so we use shift right instead (shift by 4 = div by 16 elements)
+
+ define reg_numPrimitives REG3;
+ define reg_num_wgs REG4;
+
+ reg_numPrimitives = load_dword(p_numPrimitives);
+ reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG;
+ reg_num_wgs = reg_num_wgs - C_1;
+ reg_num_wgs = reg_num_wgs >> REG_PRIMS_PER_WG_SHR;
+
+ DISPATCHDIM_X = reg_num_wgs;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_primref_to_procedurals args(
+ state.build_globals,
+ state.build_primref_buffer,
+ build_primref_index_buffers,
+ state.bvh_buffer,
+ state.geomDesc_buffer,
+ stride,
+ offset);
+}
+
+metakernel buildLeafDXR_quads(
+ MKBuilderState state,
+ qword build_primref_index_buffers,
+ dword stride,
+ dword offset,
+ qword p_numPrimitives,
+ dword allow_update)
+{
+ define C_1 REG0;
+ define REG_PRIMS_PER_WG REG1;
+ define SHIFT REG2;
+
+ C_1 = 1;
+ REG_PRIMS_PER_WG = 32;
+ SHIFT = 4;// We cannot use div, so we use shift right instead (shift by 4, then by 1 below = div by 32 elements)
+
+ define reg_numPrimitives REG3;
+ define reg_num_wgs REG4;
+
+ reg_numPrimitives = load_dword(p_numPrimitives);
+ reg_num_wgs = reg_numPrimitives + REG_PRIMS_PER_WG;
+ reg_num_wgs = reg_num_wgs - C_1;
+ reg_num_wgs = reg_num_wgs >> SHIFT;
+ reg_num_wgs = reg_num_wgs >> C_1;
+
+ DISPATCHDIM_X = reg_num_wgs;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_primref_to_quads args(
+ state.build_globals,
+ state.build_primref_buffer,
+ build_primref_index_buffers,
+ state.bvh_buffer,
+ state.geomDesc_buffer,
+ stride,
+ offset,
+ allow_update);
+}
diff --git a/src/intel/vulkan/grl/gpu/build_primref.grl b/src/intel/vulkan/grl/gpu/build_primref.grl
new file mode 100644
index 00000000000..33728bd01f6
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/build_primref.grl
@@ -0,0 +1,229 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module build_primref;
+
+kernel_module primref_kernels ("bvh_build_primref.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_kernel_primrefs_from_DXR_instances < kernelFunction="primrefs_from_DXR_instances" >;
+ kernel opencl_kernel_primrefs_from_DXR_instances_indirect < kernelFunction="primrefs_from_DXR_instances_indirect" >;
+ kernel opencl_kernel_primrefs_from_DXR_instances_pointers < kernelFunction="primrefs_from_DXR_instances_pointers" >;
+ kernel opencl_kernel_primrefs_from_DXR_instances_pointers_indirect < kernelFunction="primrefs_from_DXR_instances_pointers_indirect" >;
+
+ kernel opencl_kernel_triangles_to_primrefs < kernelFunction="triangles_to_primrefs" >;
+ kernel opencl_kernel_triangles_to_primrefs_indirect < kernelFunction="triangles_to_primrefs_indirect" >;
+ kernel opencl_kernel_procedurals_to_primrefs < kernelFunction="procedurals_to_primrefs" >;
+ kernel opencl_kernel_procedurals_to_primrefs_indirect < kernelFunction="procedurals_to_primrefs_indirect" >;
+}
+
+import struct MKBuilderState "structs.grl";
+import struct MKSizeEstimate "structs.grl";
+
+
+const PrimirefsFromInstances_GROUPSIZE = 16;
+
+metakernel buildPrimirefsFromInstances(
+ qword instanceDescBuff,
+ MKSizeEstimate estimate,
+ MKBuilderState build_state,
+ dword allowUpdate)
+{
+ define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE);
+ dispatch opencl_kernel_primrefs_from_DXR_instances(num_groups,1,1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ instanceDescBuff,
+ estimate.numPrimitives,
+ build_state.build_primref_buffer,
+ allowUpdate);
+}
+
+metakernel buildPrimirefsFromInstancesIndirect(
+ qword instanceDescBuff,
+ qword indirectBuildRangeInfo,
+ MKBuilderState build_state,
+ dword allowUpdate)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
+ C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ instanceDescBuff,
+ indirectBuildRangeInfo,
+ build_state.build_primref_buffer,
+ allowUpdate);
+}
+
+metakernel buildPrimirefsFromInstancesArrOfPtrs(
+ qword instanceDescPtrArrayBuff,
+ MKSizeEstimate estimate,
+ MKBuilderState build_state,
+ dword allowUpdate)
+{
+ define num_groups ((estimate.numPrimitives + PrimirefsFromInstances_GROUPSIZE-1)/PrimirefsFromInstances_GROUPSIZE);
+ dispatch opencl_kernel_primrefs_from_DXR_instances_pointers(num_groups,1,1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ instanceDescPtrArrayBuff,
+ estimate.numPrimitives,
+ build_state.build_primref_buffer,
+ allowUpdate);
+}
+
+metakernel buildPrimirefsFromInstancesArrOfPtrsIndirect(
+ qword instanceDescPtrArrayBuff,
+ qword indirectBuildRangeInfo,
+ MKSizeEstimate estimate,
+ MKBuilderState build_state,
+ dword allowUpdate)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
+ C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_primrefs_from_DXR_instances_pointers_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ instanceDescPtrArrayBuff,
+ build_state.build_primref_buffer,
+ indirectBuildRangeInfo,
+ allowUpdate);
+}
+
+
+
+
+metakernel primrefs_from_tris(
+ MKBuilderState build_state,
+ MKSizeEstimate estimate,
+ qword geo_ptr,
+ dword geom_id,
+ dword geom_flags,
+ dword num_prims)
+{
+ define num_threads ((num_prims+15)/16);
+ dispatch opencl_kernel_triangles_to_primrefs(num_threads,1,1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.build_primref_buffer,
+ geo_ptr,
+ (geom_id & 0x00ffffff) + (geom_flags<<24),
+ num_prims);
+}
+
+metakernel primrefs_from_tris_indirect(
+ MKBuilderState build_state,
+ MKSizeEstimate estimate,
+ qword geo_ptr,
+ qword indirectBuildRangeInfo,
+ dword geom_id,
+ dword geom_flags)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
+ C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_triangles_to_primrefs_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.build_primref_buffer,
+ geo_ptr,
+ indirectBuildRangeInfo,
+ (geom_id & 0x00ffffff) + (geom_flags << 24));
+}
+
+metakernel primrefs_from_proc(
+ MKBuilderState build_state,
+ MKSizeEstimate estimate,
+ qword geo_ptr,
+ dword geom_id,
+ dword geom_flags,
+ dword num_prims)
+{
+ define num_threads ((num_prims+15)/16);
+ dispatch opencl_kernel_procedurals_to_primrefs(num_threads,1,1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.build_primref_buffer,
+ geo_ptr,
+ (geom_id & 0x00ffffff) + (geom_flags<<24),
+ num_prims);
+}
+
+metakernel primrefs_from_proc_indirect(
+ MKBuilderState build_state,
+ MKSizeEstimate estimate,
+ qword geo_ptr,
+ qword indirectBuildRangeInfo,
+ dword geom_id,
+ dword geom_flags)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // PrimirefsFromInstances_GROUPSIZE - 1
+ C_4 = 4; // log_2(PrimirefsFromInstances_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / PrimirefsFromInstances_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_procedurals_to_primrefs_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.build_primref_buffer,
+ geo_ptr,
+ indirectBuildRangeInfo,
+ (geom_id & 0x00ffffff) + (geom_flags<<24));
+}
diff --git a/src/intel/vulkan/grl/gpu/build_refit.grl b/src/intel/vulkan/grl/gpu/build_refit.grl
new file mode 100644
index 00000000000..46d6e76add2
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/build_refit.grl
@@ -0,0 +1,324 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module build_refit;
+
+kernel_module morton_kernels ("bvh_build_refit.cl")
+{
+ links lsc_intrinsics;
+
+ kernel update_instance_leaves < kernelFunction="update_instance_leaves" >;
+ kernel refit_indirect_sg < kernelFunction="Refit_indirect_sg" >;
+ kernel update_instance_leaves_indirect < kernelFunction="update_instance_leaves_indirect" >;
+
+
+}
+
+const INSTANCE_LEAF_GROUP_SIZE = 16;
+const REFIT_GROUP_SIZE = 8;
+
+metakernel update_instance_leaves(
+ qword bvh,
+ qword dxrInstancesArray,
+ qword dxrInstancesPtrArray,
+ qword instance_leaf_aabbs,
+ dword num_instances )
+{
+ define num_groups (num_instances + INSTANCE_LEAF_GROUP_SIZE - 1) / INSTANCE_LEAF_GROUP_SIZE;
+
+ dispatch update_instance_leaves(num_groups, 1, 1) args(
+ bvh,
+ dxrInstancesArray,
+ dxrInstancesPtrArray,
+ instance_leaf_aabbs);
+}
+
+metakernel update_instance_leaves_indirect(
+ qword bvh,
+ qword dxrInstancesArray,
+ qword dxrInstancesPtrArray,
+ qword instance_leaf_aabbs,
+ qword indirectBuildRangeInfo)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // INSTANCE_LEAF_GROUP_SIZE - 1
+ C_4 = 4; // log_2(INSTANCE_LEAF_GROUP_SIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / INSTANCE_LEAF_GROUP_SIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect update_instance_leaves_indirect args(
+ bvh,
+ dxrInstancesArray,
+ dxrInstancesPtrArray,
+ instance_leaf_aabbs,
+ indirectBuildRangeInfo);
+}
+
+/*
+metakernel refit(
+ qword bvh,
+ qword geomDesc,
+ qword instance_aabbs,
+ dword dispatchSize )
+{
+ define num_groups (dispatchSize + REFIT_GROUP_SIZE - 1) / REFIT_GROUP_SIZE;
+
+ dispatch refit(num_groups, 1, 1) args(
+ bvh,
+ geomDesc,
+ instance_aabbs);
+}
+
+const REFIT_SIMD_SIZE = 8;
+const REFIT_SIMD_SIZE_SHIFT = 3;
+
+metakernel refit_indirect(
+ qword bvh,
+ qword bvh_inner_nodes_start_value,
+ qword bvh_inner_nodes_end,
+ qword geomDesc,
+ qword instance_aabbs )
+{
+ define cRoundingSIMD REG4;
+ define TWO REG3;
+ define ONE REG5;
+ cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
+
+ TWO = 2;
+ ONE = 1;
+
+ REG0 = bvh_inner_nodes_start_value;
+ REG1 = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + cRoundingSIMD;
+ REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
+ REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect refit_indirect args(
+ bvh,
+ geomDesc,
+ instance_aabbs);
+
+}
+*/
+
+metakernel refit_indirect_sg(
+ qword bvh,
+ qword bvh_inner_nodes_start_value,
+ qword bvh_inner_nodes_end,
+ qword geomDesc,
+ qword instance_aabbs )
+{
+
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect refit_indirect_sg args(
+ bvh,
+ geomDesc,
+ instance_aabbs);
+
+}
+/*
+////////////////////////////////////////////////////////////////
+// constructing treelets
+// phase 1: mark nodes that will be roots of bottom treelets
+// also for each node leave a number of startpoints that are under it and max depth of the path from the node
+metakernel find_refit_treelets(
+ qword bvh,
+ qword treelet_node_data,
+ qword scratch_startpoints,
+ qword startpointAlloc,
+ qword bvh_inner_nodes_start_value,
+ qword bvh_inner_nodes_end )
+{
+ define cRoundingSIMD REG4;
+ define TWO REG3;
+ define ONE REG5;
+ cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
+
+ TWO = 2;
+ ONE = 1;
+
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + cRoundingSIMD;
+ REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
+ REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect find_refit_treelets args(
+ bvh,
+ treelet_node_data,
+ scratch_startpoints,
+ startpointAlloc);
+}
+
+
+////////////////////////////////////////////////////////////////
+// constructing treelets
+// phase 2 totally parallel, run threads up to assign startpoints to given treelet
+//
+metakernel assign_refit_startpoints_to_treelets(
+ qword bvh,
+ qword treelet_node_data,
+ qword scratch_startpoints,
+ qword bvh_inner_nodes_start_value,
+ qword bvh_inner_nodes_end )
+{
+ define cRoundingSIMD REG4;
+ define TWO REG3;
+ define ONE REG5;
+ cRoundingSIMD = (REFIT_SIMD_SIZE - 1);
+
+ TWO = 2;
+ ONE = 1;
+
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + cRoundingSIMD;
+ REG2 = REG2 >> TWO; // JDB: >>3 must be implemented as >>2 then >>1 because command streamer
+ REG2 = REG2 >> ONE; // only supports pow2 shifts because somebody wanted to save area.
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect assign_refit_startpoints_to_treelets args(
+ bvh,
+ treelet_node_data,
+ scratch_startpoints);
+}
+
+
+////////////////////////////////////////////////////////////////
+// constructing treelets
+// phase 3 local work: group per treelet, sort the startpoints in treelets by length of the path
+metakernel finalize_treelets_in_groups(
+ qword bvh,
+ qword scratch_startpoints,
+ qword ptrNumTreelets )
+{
+ REG0 = load_qword(ptrNumTreelets);
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect finalize_treelets_in_groups args(
+ bvh,
+ scratch_startpoints);
+}
+
+
+////////////////////////////////////////////////////////////////
+// Updating treelets
+// phase 1 update vertex and generate boxes for vertices
+//
+
+const PER_GROUP_ELEMENTS_ROUNDING = 15;
+const PER_GROUP_ELEMENTS_SHIFT = 4;
+
+metakernel init_treelets_refit(qword pSquashGroupsCountToReset)
+{
+ REG1 = 0;
+ store_qword(pSquashGroupsCountToReset, REG1);
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+ //REG4 = PER_GROUP_ELEMENTS_SHIFT;
+ //REG5.hi = PER_GROUP_ELEMENTS_ROUNDING;
+ //REG5.lo = 0;
+}
+
+metakernel update_quads(
+ qword scratch_box,
+ qword bvh,
+ qword input,
+ dword numPrimsDividedBy32,
+ qword bigSquashInput)
+{
+ //REG0 = load_qword(quads_nodes_begin_end_pair);
+ //REG1.hi = REG0.lo; // this holds inner nodes begin
+ //REG2 = REG0 - REG1;
+ //REG2 = REG2 + REG5;
+ //REG2 = REG2 >> REG4;
+ //DISPATCHDIM_X = REG2.hi;
+
+ dispatch refit_quads(numPrimsDividedBy32, 1, 1) args(
+ bvh,
+ input,
+ scratch_box,
+ numPrimsDividedBy32,
+ bigSquashInput );
+}
+
+//
+////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////
+//
+// phase 1 or 2 - update primitives as well as bottom up refit internal nodes
+// in single dispatch (in single group per tree)
+metakernel refit_tree_by_group_including_quads(
+ qword squashed_inputs,
+ dword numBuilds
+)
+{
+ dispatch refit_tree_per_group(numBuilds, 1, 1) args(
+ squashed_inputs);
+}
+//
+////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////
+//
+// phase 2 bottom up refit internal nodes
+//
+metakernel refit_treelet_per_group(
+ qword bigSquashInput,
+ qword ptrNumTreelets)
+{
+ DISPATCHDIM_X = load_dword(ptrNumTreelets);
+
+ dispatch_indirect refit_treelet_per_group args(
+ bigSquashInput);
+}
+//
+////////////////////////////////////////////////////////////////
+
+#endif
+*/
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl
new file mode 100644
index 00000000000..d72f192056e
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_BFS.cl
@@ -0,0 +1,4823 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "binned_sah_shared.h"
+
+#include "libs/lsc_intrinsics.h"
+#include "intrinsics.h"
+#include "AABB.h"
+#include "AABB3f.h"
+
+#include "qbvh6.h"
+#include "common.h"
+
+#include "libs/lsc_intrinsics.h"
+
+#define SGPRINT_16x(prefix,fmt,type,val) {\
+ type v0 = sub_group_broadcast( val, 0 );\
+ type v1 = sub_group_broadcast( val, 1 );\
+ type v2 = sub_group_broadcast( val, 2 );\
+ type v3 = sub_group_broadcast( val, 3 );\
+ type v4 = sub_group_broadcast( val, 4 );\
+ type v5 = sub_group_broadcast( val, 5 );\
+ type v6 = sub_group_broadcast( val, 6 );\
+ type v7 = sub_group_broadcast( val, 7 );\
+ type v8 = sub_group_broadcast( val, 8 );\
+ type v9 = sub_group_broadcast( val, 9 );\
+ type v10 = sub_group_broadcast( val, 10 );\
+ type v11 = sub_group_broadcast( val, 11 );\
+ type v12 = sub_group_broadcast( val, 12 );\
+ type v13 = sub_group_broadcast( val, 13 );\
+ type v14 = sub_group_broadcast( val, 14 );\
+ type v15 = sub_group_broadcast( val, 15 );\
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if( get_sub_group_local_id() == 0 ) { \
+ printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \
+ fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \
+ v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}}
+
+
+#define SGPRINT_6x(prefix,fmt,type,val) {\
+ type v0 = sub_group_broadcast( val, 0 );\
+ type v1 = sub_group_broadcast( val, 1 );\
+ type v2 = sub_group_broadcast( val, 2 );\
+ type v3 = sub_group_broadcast( val, 3 );\
+ type v4 = sub_group_broadcast( val, 4 );\
+ type v5 = sub_group_broadcast( val, 5 );\
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if( get_sub_group_local_id() == 0 ) { \
+ printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \
+ v0,v1,v2,v3,v4,v5);}}
+
+#define BFS_WG_SIZE 512
+
+#define BFS_NUM_VCONTEXTS 256 // must be multiple of 64
+
+#define TREE_ARITY 6
+
+#define DFS_WG_SIZE 256
+#define DFS_THRESHOLD 256
+
+
+void BFSDispatchQueue_print(struct BFSDispatchQueue* q, uint n)
+{
+ for (uint i = 0; i < q->num_dispatches; i++)
+ printf(" %u,ctx=%u,batch=%u\n", q->wg_count[i], q->records[i].context_id, q->records[i].batch_index);
+}
+
+void VContextScheduler_print(struct VContextScheduler* scheduler)
+{
+ if (get_local_id(0) == 0)
+ {
+ printf("SCHEDULER:\n");
+ printf(" bfs=%u dfs=%u\n", scheduler->num_bfs_wgs, scheduler->num_dfs_wgs);
+
+ printf("BFS QUEUE:\n");
+ BFSDispatchQueue_print(&scheduler->bfs_queue, scheduler->num_bfs_wgs);
+
+
+ printf("DFS QUEUE\n");
+ for (uint i = 0; i < scheduler->num_dfs_wgs; i++)
+ {
+ struct DFSDispatchRecord* r = &scheduler->dfs_queue.records[i];
+ printf(" (%u-%u) root=%u depth=%u batch_index=%u\n",
+ r->primref_base, r->primref_base + r->num_primrefs,
+ r->bvh2_base, r->tree_depth, r->batch_index);
+ }
+
+ printf("CONTEXTS:\n");
+ for (uint i = 0; i < BFS_NUM_VCONTEXTS; i++)
+ {
+ if (scheduler->vcontext_state[i] != VCONTEXT_STATE_UNALLOCATED)
+ {
+ printf(" context: %u state=%u\n", i, scheduler->vcontext_state[i]);
+ printf(" prims: %u-%u\n", scheduler->contexts[i].dispatch_primref_begin, scheduler->contexts[i].dispatch_primref_end);
+ printf(" depth: %u\n", scheduler->contexts[i].tree_depth);
+ printf(" root: %u\n", scheduler->contexts[i].bvh2_root);
+ printf(" batch: %u\n", scheduler->contexts[i].batch_index);
+ }
+ }
+
+
+
+ }
+
+}
+
+
+inline float3 select_min(float3 v, bool mask)
+{
+ return (float3)(mask ? v.x : (float)(INFINITY),
+ mask ? v.y : (float)(INFINITY),
+ mask ? v.z : (float)(INFINITY));
+}
+inline float3 select_max(float3 v, bool mask)
+{
+ return (float3)(mask ? v.x : -(float)(INFINITY),
+ mask ? v.y : -(float)(INFINITY),
+ mask ? v.z : -(float)(INFINITY));
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+// The 'LRBounds' structure uses negated-max to allow
+// both atomic_min and atomic_max to be issued fused into one message
+
+struct AABB3f LRBounds_get_left_centroid( LRBounds* b )
+{
+ struct AABB3f* pbox = &b->boxes.left_centroid_bounds;
+ return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
+}
+struct AABB3f LRBounds_get_right_centroid( LRBounds* b )
+{
+ struct AABB3f* pbox = &b->boxes.right_centroid_bounds;
+ return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
+}
+struct AABB3f LRBounds_get_left_geom( LRBounds* b )
+{
+ struct AABB3f* pbox = &b->boxes.left_geom_bounds;
+ return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
+}
+struct AABB3f LRBounds_get_right_geom( LRBounds* b )
+{
+ struct AABB3f* pbox = &b->boxes.right_geom_bounds;
+ return AABB3f_construct( AABB3f_load_lower(pbox), -AABB3f_load_upper(pbox) );
+}
+
+
+void LRBounds_merge_left( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax )
+{
+ // All of the input vectors have come from sub-group reductions and are thus uniform
+ // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs
+ // The code below should result in 1 atomic_min message and a similarly large stack of movs
+
+ float mergeVal0 = INFINITY;
+ float mergeVal1 = INFINITY;
+ uint i = get_sub_group_local_id();
+
+ // insert the various merge values into one register
+ // We use two parallel variables here to enable some ILP
+
+ uint imod = (i>=6) ? (i-6) : i;
+ mergeVal0 = (imod==0) ? CMin.x : mergeVal0;
+ mergeVal1 = (imod==0) ? GMin.x : mergeVal1;
+
+ mergeVal0 = (imod==1) ? CMin.y : mergeVal0;
+ mergeVal1 = (imod==1) ? GMin.y : mergeVal1;
+
+ mergeVal0 = (imod==2) ? CMin.z : mergeVal0;
+ mergeVal1 = (imod==2) ? GMin.z : mergeVal1;
+
+ mergeVal0 = (imod==3) ? -CMax.x : mergeVal0;
+ mergeVal1 = (imod==3) ? -GMax.x : mergeVal1;
+
+ mergeVal0 = (imod==4) ? -CMax.y : mergeVal0;
+ mergeVal1 = (imod==4) ? -GMax.y : mergeVal1;
+
+ mergeVal0 = (imod==5) ? -CMax.z : mergeVal0;
+ mergeVal1 = (imod==5) ? -GMax.z : mergeVal1;
+
+ float merge = (i<6) ? mergeVal0 : mergeVal1;
+ if( i < 12 )
+ atomic_min( &b->scalars.Array[i], merge );
+
+ //atomic_min( &b->boxes.left_centroid_bounds.lower[0], CMin.x );
+ //atomic_min( &b->boxes.left_centroid_bounds.lower[1], CMin.y );
+ //atomic_min( &b->boxes.left_centroid_bounds.lower[2], CMin.z );
+ //atomic_min( &b->boxes.left_centroid_bounds.upper[0], -CMax.x );
+ //atomic_min( &b->boxes.left_centroid_bounds.upper[1], -CMax.y );
+ //atomic_min( &b->boxes.left_centroid_bounds.upper[2], -CMax.z );
+ //atomic_min( &b->boxes.left_geom_bounds.lower[0], GMin.x );
+ //atomic_min( &b->boxes.left_geom_bounds.lower[1], GMin.y );
+ //atomic_min( &b->boxes.left_geom_bounds.lower[2], GMin.z );
+ //atomic_min( &b->boxes.left_geom_bounds.upper[0], -GMax.x );
+ //atomic_min( &b->boxes.left_geom_bounds.upper[1], -GMax.y );
+ //atomic_min( &b->boxes.left_geom_bounds.upper[2], -GMax.z );
+}
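+
+// Lane layout used by the merge above (illustrative; lanes 12+ are masked off):
+//   lanes 0..5  : { CMin.x, CMin.y, CMin.z, -CMax.x, -CMax.y, -CMax.z }
+//   lanes 6..11 : { GMin.x, GMin.y, GMin.z, -GMax.x, -GMax.y, -GMax.z }
+// which lines up with scalars.Array[0..11], i.e. left_centroid_bounds followed
+// by left_geom_bounds.  LRBounds_merge_right does the same against Array[12..23].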
+
+void LRBounds_merge_right( local LRBounds* b, float3 CMin, float3 CMax, float3 GMin, float3 GMax )
+{
+ // All of the input vectors have come from sub-group reductions and are thus uniform
+ // Using atomic_min calls as below results in IGC generating 12 atomic_min messages and a large stack of movs
+ // The code below should result in 1 atomic_min message and a similarly large stack of movs
+
+ float mergeVal0 = INFINITY;
+ float mergeVal1 = INFINITY;
+ uint i = get_sub_group_local_id();
+
+ // insert the various merge values into one register
+ // We use two parallel variables here to enable some ILP
+
+ uint imod = (i>=6) ? (i-6) : i;
+ mergeVal0 = (imod==0) ? CMin.x : mergeVal0;
+ mergeVal1 = (imod==0) ? GMin.x : mergeVal1;
+
+ mergeVal0 = (imod==1) ? CMin.y : mergeVal0;
+ mergeVal1 = (imod==1) ? GMin.y : mergeVal1;
+
+ mergeVal0 = (imod==2) ? CMin.z : mergeVal0;
+ mergeVal1 = (imod==2) ? GMin.z : mergeVal1;
+
+ mergeVal0 = (imod==3) ? -CMax.x : mergeVal0;
+ mergeVal1 = (imod==3) ? -GMax.x : mergeVal1;
+
+ mergeVal0 = (imod==4) ? -CMax.y : mergeVal0;
+ mergeVal1 = (imod==4) ? -GMax.y : mergeVal1;
+
+ mergeVal0 = (imod==5) ? -CMax.z : mergeVal0;
+ mergeVal1 = (imod==5) ? -GMax.z : mergeVal1;
+
+ float merge = (i<6) ? mergeVal0 : mergeVal1;
+ if( i < 12 )
+ atomic_min( &b->scalars.Array[i+12], merge );
+
+ //atomic_min( &b->boxes.right_centroid_bounds.lower[0], CMin.x );
+ //atomic_min( &b->boxes.right_centroid_bounds.lower[1], CMin.y );
+ //atomic_min( &b->boxes.right_centroid_bounds.lower[2], CMin.z );
+ //atomic_min( &b->boxes.right_centroid_bounds.upper[0], -CMax.x );
+ //atomic_min( &b->boxes.right_centroid_bounds.upper[1], -CMax.y );
+ //atomic_min( &b->boxes.right_centroid_bounds.upper[2], -CMax.z );
+ //atomic_min( &b->boxes.right_geom_bounds.lower[0], GMin.x );
+ //atomic_min( &b->boxes.right_geom_bounds.lower[1], GMin.y );
+ //atomic_min( &b->boxes.right_geom_bounds.lower[2], GMin.z );
+ //atomic_min( &b->boxes.right_geom_bounds.upper[0], -GMax.x );
+ //atomic_min( &b->boxes.right_geom_bounds.upper[1], -GMax.y );
+ //atomic_min( &b->boxes.right_geom_bounds.upper[2], -GMax.z );
+}
+
+void LRBounds_merge( global LRBounds* globalBounds, local LRBounds* localBounds )
+{
+ uint i = get_local_id(0);
+ if( i < 24 )
+ atomic_min(&globalBounds->scalars.Array[i], localBounds->scalars.Array[i] );
+}
+
+
+void LRBounds_init( LRBounds* bounds )
+{
+ uint i = get_local_id(0) * 4;
+ if( i < 24 )
+ {
+ // compiler should merge it into a 4xdword send
+ bounds->scalars.Array[i+0] = INFINITY;
+ bounds->scalars.Array[i+1] = INFINITY;
+ bounds->scalars.Array[i+2] = INFINITY;
+ bounds->scalars.Array[i+3] = INFINITY;
+ }
+
+}
+
+
+inline void LRBounds_init_subgroup( LRBounds* bounds)
+{
+ uint sg_size = get_sub_group_size();
+ uint lane = get_sub_group_local_id();
+
+ for (uint i = lane * 4; i < 24; i += sg_size * 4)
+ {
+ // compiler should merge it into a 4xdword send
+ bounds->scalars.Array[i+0] = INFINITY;
+ bounds->scalars.Array[i+1] = INFINITY;
+ bounds->scalars.Array[i+2] = INFINITY;
+ bounds->scalars.Array[i+3] = INFINITY;
+ }
+
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+inline void BinInfo_init(struct BFS_BinInfo* bin_info)
+{
+ for (uint id = get_local_id(0) * 4; id < 18 * BFS_NUM_BINS; id += get_local_size(0) * 4)
+ {
+ float inf = INFINITY;
+ // compiler should merge it into a 4xdword send
+ bin_info->min_max[id+0] = inf;
+ bin_info->min_max[id+1] = inf;
+ bin_info->min_max[id+2] = inf;
+ bin_info->min_max[id+3] = inf;
+ }
+ for (uint id = get_local_id(0) * 4; id < 3 * BFS_NUM_BINS; id += get_local_size(0) * 4)
+ {
+ // compiler should merge it into a 4xdword send
+ bin_info->counts[id+0] = 0;
+ bin_info->counts[id+1] = 0;
+ bin_info->counts[id+2] = 0;
+ bin_info->counts[id+3] = 0;
+ }
+}
+
+
+// copy global to local
+inline void BinInfo_copy( local struct BFS_BinInfo* local_bin_info, global struct BFS_BinInfo* global_bin_info )
+{
+ for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0))
+ {
+ float f = global_bin_info->min_max[id];
+ local_bin_info->min_max[id] = f;
+ }
+ for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0))
+ {
+ local_bin_info->counts[id] = global_bin_info->counts[id];
+ }
+}
+
+inline void BinInfo_init_subgroup(struct BFS_BinInfo* bin_info)
+{
+ uint sg_size = get_sub_group_size();
+ uint lane = get_sub_group_local_id();
+
+ for (uint i = lane * 4; i < 3 * BFS_NUM_BINS; i += sg_size * 4)
+ {
+ // compiler should merge it into a 4xdword send
+ bin_info->counts[i+0] = 0;
+ bin_info->counts[i+1] = 0;
+ bin_info->counts[i+2] = 0;
+ bin_info->counts[i+3] = 0;
+ }
+
+
+ for (uint i = lane * 4; i < 18 * BFS_NUM_BINS; i += sg_size * 4)
+ {
+ // compiler should merge it into a 4xdword send
+ bin_info->min_max[i+0] = INFINITY;
+ bin_info->min_max[i+1] = INFINITY;
+ bin_info->min_max[i+2] = INFINITY;
+ bin_info->min_max[i+3] = INFINITY;
+ }
+
+}
+
+float3 shuffle_down_float3( float3 a, float3 b, uint delta )
+{
+ return (float3)(
+ intel_sub_group_shuffle_down( a.x, b.x, delta ),
+ intel_sub_group_shuffle_down( a.y, b.y, delta ),
+ intel_sub_group_shuffle_down( a.z, b.z, delta )
+ );
+}
+
+
+
+
+void BinInfo_primref_ballot_loop( local struct BFS_BinInfo* bin_info, uint axis, uint bin, float3 lower, float3 upper, bool active_lane )
+{
+ local float* bins_min = &bin_info->min_max[0];
+ local float* bins_max = &bin_info->min_max[3];
+
+ varying uint place = (bin + axis*BFS_NUM_BINS);
+ varying uint lane = get_sub_group_local_id();
+
+ uniform uint active_mask = intel_sub_group_ballot(active_lane);
+
+ while( active_mask )
+ {
+ uniform uint leader = ctz( active_mask );
+ uniform uint lead_place = intel_sub_group_shuffle( place, leader );
+ varying bool matching_bin = lead_place == place && active_lane;
+
+ varying float3 lo = (float3)(INFINITY,INFINITY,INFINITY);
+ varying float3 hi = (float3)(-INFINITY,-INFINITY,-INFINITY);
+ if (matching_bin)
+ {
+ lo = lower.xyz;
+ hi = upper.xyz;
+ }
+
+ lo = sub_group_reduce_min_float3( lo );
+ hi = sub_group_reduce_max_float3( hi );
+
+ {
+ // atomic min operation vectorized across 6 lanes
+ // [ lower.xyz ][-][upper.xyz][-]
+ //
+ // Lanes 3 and 7 are inactive
+
+ uint lmod = lane % 4;
+ uint ldiv = lane / 4;
+ float vlo = lo.x;
+ float vhi = hi.x;
+ vlo = (lmod == 1) ? lo.y : vlo;
+ vhi = (lmod == 1) ? hi.y : vhi;
+ vlo = (lmod == 2) ? lo.z : vlo;
+ vhi = (lmod == 2) ? hi.z : vhi;
+
+ float v = (ldiv == 0) ? vlo : -vhi;
+
+ if( (1<<lane) & 0x77 )
+ atomic_min( &bin_info->min_max[ 6*lead_place + lmod + 3*ldiv ], v );
+ }
+
+ //if( lane == 0 )
+ // atomic_add_local(&bin_info->counts[lead_place], popcount(active_mask & intel_sub_group_ballot(matching_bin)) );
+
+ active_mask = active_mask & intel_sub_group_ballot(!matching_bin);
+ }
+}
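+
+// How the ballot loop above converges (a summary, not extra behaviour): each
+// iteration picks the first still-active lane as the leader; every lane whose
+// (bin, axis) slot matches the leader contributes its bounds to a sub-group
+// min/max reduction; one vectorized atomic_min commits that result; and the
+// matching lanes are cleared from active_mask.  The loop therefore issues one
+// atomic per distinct bin touched by the sub-group rather than one per lane.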
+
+inline void BinInfo_add_primref(struct BinMapping* binMapping, local struct BFS_BinInfo* bin_info, PrimRef* primref, bool active_lane )
+{
+
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4( (p - binMapping->ofs) * binMapping->scale );
+
+ BinInfo_primref_ballot_loop( bin_info, 0, i.x, lower.xyz, upper.xyz, active_lane );
+ BinInfo_primref_ballot_loop( bin_info, 1, i.y, lower.xyz, upper.xyz, active_lane );
+ BinInfo_primref_ballot_loop( bin_info, 2, i.z, lower.xyz, upper.xyz, active_lane );
+
+ if (active_lane)
+ {
+ atomic_inc_local( &bin_info->counts[i.x + 0 * BFS_NUM_BINS] );
+ atomic_inc_local( &bin_info->counts[i.y + 1 * BFS_NUM_BINS] );
+ atomic_inc_local( &bin_info->counts[i.z + 2 * BFS_NUM_BINS] );
+ }
+}
+
+inline void BinInfo_merge(global struct BFS_BinInfo* global_info, local struct BFS_BinInfo* local_info)
+{
+ for (uint id = get_local_id(0); id < 18 * BFS_NUM_BINS; id += get_local_size(0))
+ {
+ float v = local_info->min_max[id];
+ if( v != INFINITY )
+ atomic_min(&global_info->min_max[id], v);
+ }
+ for (uint id = get_local_id(0); id < 3 * BFS_NUM_BINS; id += get_local_size(0))
+ {
+ uint c = local_info->counts[id];
+ if( c )
+ atomic_add_global(&global_info->counts[id], c);
+ }
+}
+
+inline struct AABB3f BinInfo_get_AABB(struct BFS_BinInfo* bin_info, ushort bin, ushort axis)
+{
+ float* min = &bin_info->min_max[6*(bin + axis*BFS_NUM_BINS)];
+ float* max = min + 3;
+ struct AABB3f box;
+ for (uint i = 0; i < 3; i++)
+ {
+ box.lower[i] = min[i];
+ box.upper[i] = -max[i];
+ }
+
+ return box;
+}
+
+inline uint3 BinInfo_get_counts(struct BFS_BinInfo* bin_info, ushort bin)
+{
+ uint3 counts;
+ counts.x = bin_info->counts[bin + 0 * BFS_NUM_BINS]; // TODO: block load these
+ counts.y = bin_info->counts[bin + 1 * BFS_NUM_BINS];
+ counts.z = bin_info->counts[bin + 2 * BFS_NUM_BINS];
+ return counts;
+}
+inline uint BinInfo_get_count(struct BFS_BinInfo* bin_info, ushort bin, ushort axis)
+{
+ return bin_info->counts[bin + axis * BFS_NUM_BINS];
+}
+
+
+void BVH2_Initialize( struct BVH2* bvh )
+{
+ bvh->num_nodes = 1;
+}
+
+inline bool BVH2_IsInnerNode( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ return (n->meta_ss & 0x10000) != 0;
+}
+inline uint BVH2_GetRoot( struct BVH2* bvh )
+{
+ return 0;
+}
+
+//////////////////////////////////////////////
+// BVH2NodeMetaData funcs
+//////////////////////////////////////////////
+struct BVH2NodeMetaData
+{
+ uint meta_u; // leaf: primref start. inner: offset from node to its first child
+ uint meta_ss;
+};
+
+inline struct BVH2NodeMetaData BVH2_GetNodeMetaData( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ struct BVH2NodeMetaData meta;
+ meta.meta_u = n->meta_u;
+ meta.meta_ss = n->meta_ss;
+ return meta;
+}
+
+inline bool BVH2NodeMetaData_IsInnerNode( struct BVH2NodeMetaData* meta )
+{
+ return (meta->meta_ss & 0x10000) != 0;
+}
+
+inline ushort BVH2NodeMetaData_GetLeafPrimCount( struct BVH2NodeMetaData* meta )
+{
+ return meta->meta_ss & 0xffff;
+}
+
+inline uint BVH2NodeMetaData_GetLeafPrimStart( struct BVH2NodeMetaData* meta )
+{
+ return meta->meta_u;
+}
+
+inline uint BVH2NodeMetaData_GetMask( struct BVH2NodeMetaData* meta )
+{
+ return (meta->meta_ss>>24);
+}
+
+//////////////////////////////////////////////
+
+inline ushort BVH2_GetLeafPrimCount( struct BVH2* bvh, uint node_index )
+{
+ struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index;
+ return n->meta_ss & 0xffff;
+}
+inline uint BVH2_GetLeafPrimStart( struct BVH2* bvh, uint node_index )
+{
+ struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index;
+ return n->meta_u;
+}
+inline uint2 BVH2_GetChildIndices( struct BVH2* bvh, uint node_index )
+{
+ struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index;
+ uint2 idx;
+ idx.x = n->meta_u;
+ idx.y = idx.x + (n->meta_ss & 0xffff);
+ return idx;
+}
+
+inline float BVH2_GetNodeArea( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ return AABB3f_halfArea( &n->box );
+}
+
+
+inline struct AABB3f BVH2_GetNodeBox( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ return n->box;
+}
+inline void BVH2_SetNodeBox( global struct BVH2* bvh, uint node_index, struct AABB3f* box )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ n->box = *box;
+}
+
+inline void BVH2_SetNodeBox_lu( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ AABB3f_set( &n->box, lower, upper );
+}
+
+inline void BVH2_InitNodeBox( struct BVH2* bvh, uint node_index )
+{
+ struct BVH2Node* n = ((struct BVH2Node*)(bvh + 1)) + node_index;
+ AABB3f_init( &n->box );
+}
+
+inline struct AABB BVH2_GetAABB( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ struct AABB r;
+ r.lower.xyz = AABB3f_load_lower( &n->box );
+ r.upper.xyz = AABB3f_load_upper( &n->box );
+ return r;
+}
+
+inline void BVH2_WriteInnerNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint2 child_offsets, uint mask )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ n->box = *box;
+ n->meta_u = child_offsets.x;
+ n->meta_ss = 0x10000 + (child_offsets.y - child_offsets.x) + (mask<<24);
+ // n->is_inner = true;
+}
+
+inline void BVH2_WriteLeafNode( global struct BVH2* bvh, uint node_index, struct AABB3f* box, uint prim_start, uint prim_count, uint mask )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ n->box = *box;
+ n->meta_u = prim_start;
+ n->meta_ss = prim_count + (mask<<24);
+ // n->is_inner = false;
+}
+
+inline uint BVH2_GetMask( global struct BVH2* bvh, uint node_index )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ return (n->meta_ss>>24);
+}
+
+
+uint BVH2_AllocateNodes( global struct BVH2* bvh, uint num_nodes )
+{
+ return atomic_add_global( &bvh->num_nodes, num_nodes );
+}
+
+inline void BVH2_AtomicMergeNodeBox( global struct BVH2* bvh, uint node_index, float3 lower, float3 upper )
+{
+ global struct BVH2Node* n = ((global struct BVH2Node*)(bvh + 1)) + node_index;
+ AABB3f_atomic_merge_global_lu( &n->box, lower, upper );
+}
+
+
+void BVH2_print( global struct BVH2* bvh, uint start_node )
+{
+ if ( get_local_id( 0 ) == 0 && get_sub_group_id() == 0 )
+ {
+ uint num_nodes = bvh->num_nodes;
+
+ uint2 stack[BFS_MAX_DEPTH * 2];
+ uint sp = 0;
+
+ printf( "allocated_nodes=%u\n", num_nodes );
+
+ stack[sp++] = (uint2)(start_node, 0);
+ while ( sp > 0 )
+ {
+ uint2 data = stack[--sp];
+ uint node = data.x;
+ uint depth = data.y;
+
+ for ( uint i = 0; i < depth; i++ )
+ printf( " " );
+
+ if ( BVH2_IsInnerNode( bvh, node ) )
+ {
+ uint2 kids = BVH2_GetChildIndices( bvh, node );
+ printf( " %5u: inner: %u %u \n", node, kids.x, kids.y );
+ stack[sp++] = (uint2)(kids.y, depth + 1);
+ stack[sp++] = (uint2)(kids.x, depth + 1);
+
+ struct AABB3f l = BVH2_GetNodeBox( bvh, kids.x );
+ struct AABB3f r = BVH2_GetNodeBox( bvh, kids.y );
+ struct AABB3f p = BVH2_GetNodeBox( bvh, node );
+
+ float3 pl = AABB3f_load_lower( &p );
+ float3 pu = AABB3f_load_upper( &p );
+ float3 ll = AABB3f_load_lower( &l );
+ float3 lu = AABB3f_load_upper( &l );
+ float3 rl = AABB3f_load_lower( &r );
+ float3 ru = AABB3f_load_upper( &r );
+ if ( any( ll < pl ) || any( rl < pl ) ||
+ any( lu > pu ) || any( ru > pu ) )
+ {
+ for ( uint i = 0; i < depth; i++ )
+ printf( " " );
+
+ printf( "BAD_BOUNDS!!!!!!!! %u\n", node );
+ }
+
+
+ }
+ else
+ {
+
+ uint start = BVH2_GetLeafPrimStart( bvh, node );
+ uint count = BVH2_GetLeafPrimCount( bvh, node );
+ printf( " %5u: leaf: start=%u count=%u\n ",node,start,count );
+
+ }
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+global uint* SAHBuildGlobals_GetPrimrefIndices_In( struct SAHBuildGlobals* globals, bool odd_pass )
+{
+ uint num_refs = globals->num_primrefs;
+ global uint* ib = (global uint*) globals->p_primref_index_buffers;
+ return ib + (odd_pass ? num_refs : 0);
+}
+
+global uint* SAHBuildGlobals_GetPrimrefIndices_Out( struct SAHBuildGlobals* globals, bool odd_pass )
+{
+ uint num_refs = globals->num_primrefs;
+ global uint* ib = (global uint*) globals->p_primref_index_buffers;
+ return ib + (odd_pass ? 0 : num_refs);
+}
+
+global PrimRef* SAHBuildGlobals_GetPrimrefs( struct SAHBuildGlobals* globals )
+{
+ return (global PrimRef*) globals->p_primrefs_buffer;
+}
+
+global struct BVH2* SAHBuildGlobals_GetBVH2( struct SAHBuildGlobals* globals )
+{
+ return (global struct BVH2*)globals->p_bvh2;
+}
+
+uint SAHBuildGlobals_GetLeafSizeInBytes( struct SAHBuildGlobals* globals )
+{
+ return globals->leaf_size;
+}
+
+uint SAHBuildGlobals_GetLeafType( struct SAHBuildGlobals* globals )
+{
+ return globals->leaf_type;
+}
+
+uint SAHBuildGlobals_GetInternalNodeType( struct SAHBuildGlobals* globals )
+{
+ return NODE_TYPE_INTERNAL;
+}
+
+global struct BVHBase* SAHBuildGlobals_GetBVHBase( struct SAHBuildGlobals* globals )
+{
+ return (global struct BVHBase*) globals->p_bvh_base;
+}
+
+uint SAHBuildGlobals_GetTotalPrimRefs( struct SAHBuildGlobals* globals )
+{
+ return globals->num_primrefs;
+}
+
+inline bool SAHBuildGlobals_NeedBackPointers( struct SAHBuildGlobals* globals )
+{
+ return globals->flags & SAH_FLAG_NEED_BACKPOINTERS;
+}
+inline bool SAHBuildGlobals_NeedMasks( struct SAHBuildGlobals* globals )
+{
+ return globals->flags & SAH_FLAG_NEED_MASKS;
+}
+
+
+void SAHBuildGlobals_print( struct SAHBuildGlobals* globals )
+{
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "SAHBuildGlobals: %p\n", globals );
+ printf( " p_primref_index_buffers =%p\n", globals->p_primref_index_buffers );
+ printf( " p_primrefs_buffer =%p\n", globals->p_primrefs_buffer );
+ printf( " p_bvh2 =%p\n", globals->p_bvh2 );
+ printf( " p_globals =%p\n", globals->p_globals );
+ printf( " p_bvh_base =%p\n", globals->p_bvh_base );
+ printf( " num_primrefs = %u\n", globals->num_primrefs );
+ printf( " leaf_size = %u\n", globals->leaf_size );
+ printf( " leaf_type = %u\n", globals->leaf_type );
+ printf( " p_qnode_buffer = %p\n", globals->p_qnode_root_buffer);
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+uint get_num_wgs(uint thread_count, uint WG_SIZE)
+{
+ return (thread_count + WG_SIZE - 1) / WG_SIZE;
+}
+
+
+
+
+
+struct BFSDispatchArgs
+{
+ global struct VContextScheduler* scheduler;
+ global struct VContext* context;
+ global struct BVH2* bvh2;
+ global uint* primref_index_in;
+ global uint* primref_index_out;
+ global PrimRef* primref_buffer;
+
+ uint wg_primref_begin;
+ uint wg_primref_end;
+ uint dispatch_primref_begin;
+ uint dispatch_primref_end;
+ uint context_id;
+ uint num_wgs;
+ uint bvh2_root;
+ uint global_num_primrefs;
+ bool do_mask_processing;
+};
+
+
+
+
+// TODO_OPT: Enable larger WGs
+// We need a way to do this in a portable fashion.
+// Gen12 can support larger WGs than Gen9 can
+//
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
+kernel void
+begin( global struct VContextScheduler* scheduler,
+ dword leaf_size,
+ dword leaf_type,
+ global uint* primref_index_buffers,
+ global PrimRef* primref_buffer,
+ global struct BVH2* bvh2,
+ global struct BVHBase* bvh_base,
+ global struct Globals* globals,
+ global struct SAHBuildGlobals* sah_globals,
+ global uint2* qnode_root_buffer,
+ dword sah_globals_flags
+ )
+{
+ dword num_primrefs = globals->numPrimitives;
+ if ( get_local_id( 0 ) == 0 )
+ {
+ sah_globals->p_primrefs_buffer = (qword) primref_buffer;
+ sah_globals->p_primref_index_buffers = (qword)primref_index_buffers;
+ sah_globals->p_bvh2 = (qword) bvh2;
+ sah_globals->p_bvh_base = (qword) bvh_base;
+ sah_globals->leaf_size = leaf_size;
+ sah_globals->leaf_type = leaf_type;
+ sah_globals->num_primrefs = num_primrefs;
+ sah_globals->p_globals = (qword) globals;
+ sah_globals->p_qnode_root_buffer = (gpuva_t) qnode_root_buffer;
+ sah_globals->flags = sah_globals_flags;
+
+ // initialize the spill stack
+ scheduler->bfs2_spill_stack.size = 0;
+
+ // initialize BVH2 node counter
+ BVH2_Initialize( bvh2 );
+
+ // configure first vcontext for first build
+ scheduler->contexts[0].dispatch_primref_begin = 0;
+ scheduler->contexts[0].dispatch_primref_end = num_primrefs;
+ scheduler->contexts[0].bvh2_root = BVH2_GetRoot( bvh2 );
+ scheduler->contexts[0].tree_depth = 0;
+ scheduler->contexts[0].batch_index = 0;
+
+ scheduler->bfs_queue.records[0].context_id = 0;
+
+ scheduler->contexts[0].num_left = 0;
+ scheduler->contexts[0].num_right = 0;
+ scheduler->contexts[0].lr_mask = 0;
+
+ // copy centroid bounds into the BVH2 root node
+ BVH2_SetNodeBox_lu( bvh2, BVH2_GetRoot( bvh2 ), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz );
+
+ // zero the trivial build counters.. these are only used by the batch-build path
+ // but single-wg QNode path (if used) depends on them
+ scheduler->num_trivial_builds = 0;
+ scheduler->num_single_builds = 0;
+
+ // initialize the root-buffer counters
+ sah_globals->root_buffer_num_produced = 0;
+ sah_globals->root_buffer_num_produced_hi = 0;
+ sah_globals->root_buffer_num_consumed = 0;
+ sah_globals->root_buffer_num_consumed_hi = 0;
+ }
+
+ // initialize vcontext states
+ for ( uint i = get_local_id( 0 ); i < BFS_NUM_VCONTEXTS; i += get_local_size( 0 ) )
+ scheduler->vcontext_state[i] = (i==0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED;
+
+ // initialize global bin info in vcontext - only context[0] will be used in first iteration
+ BinInfo_init( &scheduler->contexts[0].global_bin_info );
+ LRBounds_init( &scheduler->contexts[0].lr_bounds );
+
+ // barrier( CLK_GLOBAL_MEM_FENCE ); // lsc flush ... driver now does these as part of COMPUTE_WALKER
+}
+
+// TODO_OPT: Enable larger WGs
+// We need a way to do this in a portable fashion.
+// Gen12 can support larger WGs than Gen9 can
+//
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(512, 1, 1)))
+kernel void
+categorize_builds_and_init_scheduler(
+ global struct VContextScheduler* scheduler,
+ global gpuva_t* globals_ptrs, // OCL-C does not allow kernel parameters to be pointer-to-pointer, so we trick it...
+ global struct SAHBuildBuffersInfo* buffers_info,
+ global struct SAHBuildGlobals* builds_out,
+ dword num_builds
+)
+{
+ local uint num_trivial;
+ local uint num_single;
+ local uint num_full;
+
+ if (get_group_id(0) == 0) // first workgroup performs build categorization
+ {
+ if (get_local_id(0) == 0)
+ {
+ num_trivial = 0;
+ num_single = 0;
+ num_full = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // first pass, count builds of each type
+ uint triv = 0;
+ uint single = 0;
+ uint full = 0;
+ for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0))
+ {
+ global struct Globals* globals = (global struct Globals*) globals_ptrs[i];
+ dword num_refs = globals->numPrimitives;
+
+ if (num_refs <= TRIVIAL_BUILD_THRESHOLD)
+ triv++;
+ else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD)
+ single++;
+ else
+ full++;
+ }
+
+ // merge counts across the work-group. After the atomics, each of these variables holds this thread's starting offset within its category's output range
+ triv = atomic_add_local(&num_trivial, triv);
+ single = atomic_add_local(&num_single, single);
+ full = atomic_add_local(&num_full, full);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ global struct SAHBuildGlobals* trivial_builds_out = builds_out;
+ global struct SAHBuildGlobals* single_builds_out = builds_out + num_trivial;
+ global struct SAHBuildGlobals* full_builds_out = builds_out + num_trivial + num_single;
+
+ for (uint i = get_local_id(0); i < num_builds; i += get_local_size(0))
+ {
+ global struct Globals* globals = (global struct Globals*) globals_ptrs[i];
+ global struct SAHBuildBuffersInfo* buffers = &buffers_info[i];
+
+ dword num_refs = globals->numPrimitives;
+ dword leaf_type = globals->leafPrimType;
+ dword leaf_size = globals->leafSize;
+
+ global struct SAHBuildGlobals* place;
+ if (num_refs <= TRIVIAL_BUILD_THRESHOLD)
+ place = trivial_builds_out + (triv++);
+ else if (num_refs <= SINGLE_WG_BUILD_THRESHOLD)
+ place = single_builds_out + (single++);
+ else
+ place = full_builds_out + (full++);
+
+ place->p_primref_index_buffers = buffers->p_primref_index_buffers;
+ place->p_primrefs_buffer = buffers->p_primrefs_buffer;
+ place->p_bvh2 = buffers->p_bvh2;
+ place->p_bvh_base = buffers->p_bvh_base;
+ place->p_globals = (gpuva_t)globals;
+ place->num_primrefs = num_refs;
+ place->leaf_size = leaf_size;
+ place->leaf_type = leaf_type;
+ place->flags = buffers->sah_globals_flags;
+ place->p_qnode_root_buffer = buffers->p_qnode_root_buffer;
+
+ // only initialize the BVH2 if it will actually be used by the build;
+ // trivial and single-WG builds do not use it
+ if( num_refs > SINGLE_WG_BUILD_THRESHOLD )
+ {
+ // initialize BVH2 node counter
+ global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(place);
+ BVH2_Initialize(bvh2);
+
+ // copy centroid bounds into the BVH2 root node
+ BVH2_SetNodeBox_lu(bvh2, BVH2_GetRoot(bvh2), globals->centroidBounds.lower.xyz, globals->centroidBounds.upper.xyz);
+ }
+ }
+
+ if (get_local_id(0) == 0)
+ {
+ scheduler->num_trivial_builds = num_trivial;
+ scheduler->num_single_builds = num_single;
+ scheduler->batched_build_offset = num_trivial + num_single;
+ scheduler->batched_build_count = num_full;
+ }
+ }
+ else // second workgroup initializes the scheduler
+ {
+ // initialize vcontext states
+ for (uint i = get_local_id(0); i < BFS_NUM_VCONTEXTS; i += get_local_size(0))
+ scheduler->vcontext_state[i] = (i == 0) ? VCONTEXT_STATE_EXECUTING : VCONTEXT_STATE_UNALLOCATED;
+
+ // initialize global bin info in vcontexts
+ for (uint i = get_sub_group_id(); i < BFS_NUM_VCONTEXTS; i += get_num_sub_groups())
+ BinInfo_init_subgroup(&scheduler->contexts[i].global_bin_info);
+
+ // initialize the spill stack
+ if (get_local_id(0) == 0)
+ scheduler->bfs2_spill_stack.size = 0;
+ }
+
+ //barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE );// lsc flush ... driver now does these as part of COMPUTE_WALKER
+}
+
+
+
+
+
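+// Seed up to BFS_NUM_VCONTEXTS vcontexts with the next slice of batched builds, publish the
+// total BFS work-group count and a loop-termination mask for the command streamer to consume,
+// and advance batched_build_offset/batched_build_count so the next iteration picks up any
+// remaining builds.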
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BFS_NUM_VCONTEXTS, 1, 1)))
+kernel void
+begin_batchable(
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* sah_globals
+)
+{
+ ushort scheduler_build_offset = scheduler->batched_build_offset;
+ ushort scheduler_num_builds = scheduler->batched_build_count;
+
+ ushort num_builds = min( scheduler_num_builds, (ushort)BFS_NUM_VCONTEXTS );
+
+ uint num_wgs = 0;
+
+ ushort tid = get_local_id(0);
+ if ( tid < num_builds )
+ {
+ ushort batch_index = scheduler_build_offset + tid;
+
+ uint num_primrefs = sah_globals[batch_index].num_primrefs;
+
+ // configure this thread's vcontext for its batched build
+ scheduler->contexts[tid].dispatch_primref_begin = 0;
+ scheduler->contexts[tid].dispatch_primref_end = num_primrefs;
+ scheduler->contexts[tid].bvh2_root = BVH2_GetRoot( SAHBuildGlobals_GetBVH2(&sah_globals[batch_index]) );
+ scheduler->contexts[tid].tree_depth = 0;
+ scheduler->contexts[tid].batch_index = batch_index;
+ scheduler->vcontext_state[tid] = VCONTEXT_STATE_EXECUTING;
+
+ scheduler->contexts[tid].num_left = 0;
+ scheduler->contexts[tid].num_right = 0;
+ scheduler->contexts[tid].lr_mask = 0;
+
+ num_wgs = get_num_wgs( num_primrefs, BFS_WG_SIZE );
+
+ scheduler->bfs_queue.wg_count[tid] = num_wgs;
+ scheduler->bfs_queue.records[tid].batch_index = batch_index;
+ scheduler->bfs_queue.records[tid].context_id = tid;
+ }
+
+ num_wgs = work_group_reduce_add(num_wgs);
+
+ if (tid == 0)
+ {
+ // write out build count and offset for next BFS iteration
+ scheduler->batched_build_offset = scheduler_build_offset + num_builds;
+ scheduler->batched_build_count = scheduler_num_builds - num_builds;
+
+ // write out initial WG count and loop termination mask for command streamer to consume
+ scheduler->batched_build_wg_count = num_wgs;
+ scheduler->batched_build_loop_mask = (scheduler_num_builds > num_builds) ? 1 : 0;
+
+ scheduler->bfs_queue.num_dispatches = num_builds;
+ }
+
+ for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() )
+ BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info );
+
+ for ( uint i = get_sub_group_id(); i < num_builds; i += get_num_sub_groups() )
+ LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds );
+}
+
+
+
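+// Build-range classification helpers:
+//  - ranges with <= TREE_ARITY primrefs become leaves,
+//  - ranges with <= DFS_THRESHOLD primrefs go to the single-WG DFS phase,
+//  - anything larger continues through breadth-first (BFS) binning passes.
+// The *_2 variants classify a (left,right) count pair at once and return OpenCL vector
+// booleans (-1 == true, 0 == false) for use with select()/any().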
+bool is_leaf( uint num_refs )
+{
+ return num_refs <= TREE_ARITY;
+}
+
+bool is_dfs( uint num_refs )
+{
+ return num_refs > TREE_ARITY && num_refs <= DFS_THRESHOLD;
+}
+
+bool is_bfs( uint num_refs )
+{
+ return num_refs > DFS_THRESHOLD;
+}
+
+int2 is_leaf_2( uint2 num_refs )
+{
+ return num_refs.xy <= TREE_ARITY;
+}
+int2 is_bfs_2( uint2 num_refs )
+{
+ return num_refs.xy > DFS_THRESHOLD;
+}
+
+int2 is_dfs_2( uint2 num_refs )
+{
+ return num_refs.xy > TREE_ARITY && num_refs.xy <= DFS_THRESHOLD;
+}
+
+#if 0
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+sg_scheduler( global struct VContextScheduler* scheduler )
+{
+ local struct BFS1SpillStackEntry SLM_local_spill_stack[BFS_NUM_VCONTEXTS];
+ local uchar SLM_context_state[BFS_NUM_VCONTEXTS];
+ local vcontext_id_t SLM_free_list[BFS_NUM_VCONTEXTS];
+ local vcontext_id_t SLM_exec_list[BFS_NUM_VCONTEXTS];
+
+
+ varying ushort lane = get_sub_group_local_id();
+
+ uniform uint free_list_size = 0;
+ uniform uint exec_list_size = 0;
+
+ // read context states, build lists of free and executing contexts
+ for (varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size())
+ {
+ uchar state = scheduler->vcontext_state[i];
+ SLM_context_state[i] = state;
+
+ uniform ushort exec_mask = intel_sub_group_ballot(state == VCONTEXT_STATE_EXECUTING);
+
+ varying ushort prefix_exec = subgroup_bit_prefix_exclusive(exec_mask);
+ varying ushort prefix_free = lane - prefix_exec;
+ varying ushort exec_list_pos = exec_list_size + prefix_exec;
+ varying ushort free_list_pos = free_list_size + prefix_free;
+
+ if (state == VCONTEXT_STATE_EXECUTING)
+ SLM_exec_list[exec_list_pos] = i;
+ else
+ SLM_free_list[free_list_pos] = i;
+
+ uniform ushort num_exec = popcount(exec_mask);
+ exec_list_size += num_exec;
+ free_list_size += get_sub_group_size() - num_exec;
+ }
+
+ uniform uint total_bfs_dispatches = 0;
+ uniform uint total_dfs_dispatches = 0;
+ uniform uint bfs_spill_stack_size = 0;
+ uniform uint total_bfs_wgs = 0;
+
+ // process executing context. accumulate bfs/dfs dispatches and free-list entries
+ for (uint i = 0; i < exec_list_size; i+= get_sub_group_size() )
+ {
+ varying ushort num_dfs_dispatches = 0;
+ varying ushort num_bfs_spills = 0;
+
+ varying ushort num_bfs_children;
+ varying ushort context_id;
+ struct VContext* context;
+ varying uint num_left ;
+ varying uint num_right ;
+ varying uint primref_begin ;
+ varying uint primref_end ;
+ varying uint depth ;
+
+ bool active_lane = (i + lane) < exec_list_size;
+ if ( active_lane )
+ {
+ context_id = SLM_exec_list[i + lane];
+ context = &scheduler->contexts[context_id];
+
+ num_left = context->num_left;
+ num_right = context->num_right;
+ primref_begin = context->dispatch_primref_begin;
+ primref_end = context->dispatch_primref_end;
+ depth = context->tree_depth;
+
+ // get dispatch counts
+
+ num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right);
+ num_bfs_children = is_bfs(num_left) + is_bfs(num_right);
+ num_bfs_spills = (num_bfs_children == 2) ? 1 : 0;
+ }
+
+ // allocate space for DFS, BFS dispatches, and BFS spills
+ varying uint dfs_pos = total_dfs_dispatches + sub_group_scan_exclusive_add(num_dfs_dispatches);
+ varying ushort mask_bfs_spills = intel_sub_group_ballot(num_bfs_children & 2); // spill if #children == 2
+ varying ushort mask_bfs_dispatches = intel_sub_group_ballot(num_bfs_children & 3); // dispatch if #children == 1 or 2
+ varying uint bfs_spill_pos = bfs_spill_stack_size + subgroup_bit_prefix_exclusive(mask_bfs_spills);
+ varying uint bfs_dispatch_pos = total_bfs_dispatches + subgroup_bit_prefix_exclusive(mask_bfs_dispatches);
+
+ total_dfs_dispatches += sub_group_reduce_add(num_dfs_dispatches);
+ bfs_spill_stack_size += popcount(mask_bfs_spills);
+ total_bfs_dispatches += popcount(mask_bfs_dispatches);
+
+ varying uint num_bfs_wgs = 0;
+ if (active_lane)
+ {
+ if (num_dfs_dispatches)
+ {
+ if (is_dfs(num_left))
+ {
+ scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin;
+ scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left;
+ scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->left_bvh2_root;
+ scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1;
+ dfs_pos++;
+ }
+ if (is_dfs(num_right))
+ {
+ scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left;
+ scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right;
+ scheduler->dfs_queue.records[dfs_pos].bvh2_base = context->right_bvh2_root;
+ scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1;
+ }
+ }
+
+ uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right);
+ if (num_bfs_children == 2)
+ {
+ // spill the right child.. push an entry onto local spill stack
+ SLM_local_spill_stack[bfs_spill_pos].primref_begin = primref_begin + num_left;
+ SLM_local_spill_stack[bfs_spill_pos].primref_end = primref_end;
+ SLM_local_spill_stack[bfs_spill_pos].bvh2_root = context->right_bvh2_root;
+ SLM_local_spill_stack[bfs_spill_pos].tree_depth = depth + 1;
+
+ // setup BFS1 dispatch for left child
+ context->dispatch_primref_end = primref_begin + num_left;
+ context->bvh2_root = context->left_bvh2_root;
+ context->tree_depth = depth + 1;
+ num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE);
+
+ scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs;
+ scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id;
+ }
+ else if (num_bfs_children == 1)
+ {
+ // setup BFS1 dispatch for whichever child wants it
+ if (is_bfs(num_left))
+ {
+ // bfs on left child
+ context->dispatch_primref_end = context->dispatch_primref_begin + num_left;
+ context->bvh2_root = context->left_bvh2_root;
+ context->tree_depth = depth + 1;
+ num_bfs_wgs = get_num_wgs(num_left, BFS_WG_SIZE);
+ }
+ else
+ {
+ // bfs on right child
+ context->dispatch_primref_begin = context->dispatch_primref_begin + num_left;
+ context->bvh2_root = context->right_bvh2_root;
+ context->tree_depth = depth + 1;
+ num_bfs_wgs = get_num_wgs(num_right, BFS_WG_SIZE);
+ }
+
+ scheduler->bfs_queue.wg_count[bfs_dispatch_pos] = num_bfs_wgs;
+ scheduler->bfs_queue.records[bfs_dispatch_pos].context_id = context_id;
+ }
+ else
+ {
+ // no bfs dispatch.. this context is now free
+ SLM_context_state[context_id] = VCONTEXT_STATE_UNALLOCATED;
+ }
+ }
+
+ // count bfs work groups
+ total_bfs_wgs += sub_group_reduce_add(num_bfs_wgs);
+
+ // add newly deallocated contexts to the free list
+ uniform uint free_mask = intel_sub_group_ballot( active_lane && num_bfs_children == 0);
+ varying uint free_list_pos = free_list_size + subgroup_bit_prefix_exclusive(free_mask);
+ free_list_size += popcount(free_mask);
+
+ if ( free_mask & (1<<lane) )
+ SLM_free_list[free_list_pos] = context_id;
+
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // if we have more free contexts than spills, read additional spills from the scheduler's spill stack
+ uniform uint memory_spill_stack_size = scheduler->bfs2_spill_stack.size;
+
+ if(bfs_spill_stack_size < free_list_size && memory_spill_stack_size > 0 )
+ {
+ uniform uint read_count = min(free_list_size - bfs_spill_stack_size, memory_spill_stack_size);
+
+ for (varying uint i = lane; i < read_count; i+= get_sub_group_size())
+ SLM_local_spill_stack[bfs_spill_stack_size + i] = scheduler->bfs2_spill_stack.entries[memory_spill_stack_size - 1 - i];
+
+ bfs_spill_stack_size += read_count;
+ memory_spill_stack_size -= read_count;
+ }
+
+ // steal pending BFS work and assign it to free contexts
+ uniform uint num_steals = min(bfs_spill_stack_size, free_list_size);
+
+ for (uniform uint i = 0; i < num_steals; i += get_sub_group_size())
+ {
+ varying uint num_bfs_wgs = 0;
+
+ if (i + lane < num_steals)
+ {
+ uint context_id = SLM_free_list[i+lane];
+ struct VContext* context = &scheduler->contexts[context_id];
+ struct BFS1SpillStackEntry entry = SLM_local_spill_stack[i+lane];
+
+ context->dispatch_primref_begin = entry.primref_begin;
+ context->dispatch_primref_end = entry.primref_end;
+ context->bvh2_root = entry.bvh2_root;
+ context->tree_depth = entry.tree_depth;
+
+ num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE);
+
+ scheduler->bfs_queue.wg_count[total_bfs_dispatches + i + lane] = num_bfs_wgs;
+ scheduler->bfs_queue.records[total_bfs_dispatches + i + lane].context_id = context_id;
+
+ SLM_context_state[context_id] = VCONTEXT_STATE_EXECUTING;
+ }
+
+ total_bfs_wgs += sub_group_reduce_add( num_bfs_wgs );
+ }
+
+ total_bfs_dispatches += num_steals;
+
+ // write out excess spills to global spill stack
+ uniform uint extra_spills = bfs_spill_stack_size - num_steals;
+ for (varying uint i = lane; i < extra_spills; i += get_sub_group_size())
+ {
+ scheduler->bfs2_spill_stack.entries[memory_spill_stack_size + i] = SLM_local_spill_stack[num_steals+i];
+ }
+
+
+ // write out modified context states
+ for ( varying uint i = lane; i < BFS_NUM_VCONTEXTS; i += get_sub_group_size())
+ scheduler->vcontext_state[i] = SLM_context_state[i];
+
+
+ if (get_local_id(0) == 0)
+ {
+ // write out new memory stack size
+ scheduler->bfs2_spill_stack.size = memory_spill_stack_size + extra_spills;
+
+ // store workgroup counters
+ scheduler->bfs_queue.num_dispatches = total_bfs_dispatches;
+ scheduler->num_bfs_wgs = total_bfs_wgs;
+ scheduler->num_dfs_wgs = total_dfs_dispatches;
+ }
+
+ // barrier(CLK_GLOBAL_MEM_FENCE); // make memory writes globally visible// lsc flush ... driver now does these as part of COMPUTE_WALKER
+}
+#endif
+
+#define SCHEDULER_SG_SIZE 16
+#define SCHEDULER_WG_SIZE BFS_NUM_VCONTEXTS
+#define SCHEDULER_NUM_SGS (SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE)
+
+
+struct BFSDispatchArgs get_bfs_args_from_record_batchable(
+ struct BFSDispatchRecord* record,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer );
+
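+// Per-iteration scheduler: one work-item per vcontext (WG size == BFS_NUM_VCONTEXTS).
+// A context that just completed BFS pass 2 finalizes its BVH2 node (allocates and writes the
+// two children), emits DFS records for children small enough for the DFS phase, and pushes
+// BFS children onto a local spill stack. The spilled BFS work is then redistributed: each
+// context pulls first from the local stack, then from the global bfs2_spill_stack, and any
+// surplus is written back out to the global stack. Finally, work-group reductions produce the
+// dispatch and work-group counts that the command streamer consumes for the next BFS round.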
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(SCHEDULER_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(SCHEDULER_SG_SIZE)))
+kernel void
+scheduler(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS1SpillStackEntry SLM_local_spill_stack[2 * BFS_NUM_VCONTEXTS];
+ local uint SLM_local_spill_stack_size;
+ local uint SLM_dfs_dispatch_count;
+
+ if (get_local_id(0) == 0)
+ {
+ SLM_local_spill_stack_size = 0;
+ SLM_dfs_dispatch_count = 0;
+ }
+
+ uint context_id = get_local_id(0);
+ uint state = scheduler->vcontext_state[context_id];
+ uint initial_state = state;
+
+ uint batch_index = 0;
+ global struct VContext* context = &scheduler->contexts[context_id];
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+
+ uint global_spill_stack_size = scheduler->bfs2_spill_stack.size;
+
+
+ if (state == VCONTEXT_STATE_EXECUTING)
+ {
+ uint left_bvh2_root;
+ uint right_bvh2_root;
+
+ uint num_left = context->num_left;
+ uint num_right = context->num_right;
+
+ uint primref_begin = context->dispatch_primref_begin;
+ uint primref_end = context->dispatch_primref_end;
+
+ uint depth = context->tree_depth;
+ uint batch_index = context->batch_index;
+
+ struct BFSDispatchRecord record;
+ record.context_id = context_id;
+ record.batch_index = context->batch_index;
+
+ struct BFSDispatchArgs args = get_bfs_args_from_record_batchable( &record, scheduler, sah_globals);
+
+ // do cleanup of bfs_pass2
+ {
+ // compute geom bounds
+ struct AABB3f left_geom_bounds;
+ struct AABB3f right_geom_bounds;
+ struct AABB3f left_centroid_bounds;
+ struct AABB3f right_centroid_bounds;
+ uint2 lr_counts = (uint2)(num_left, num_right);
+
+ {
+ left_centroid_bounds = LRBounds_get_left_centroid( &context->lr_bounds );
+ left_geom_bounds = LRBounds_get_left_geom( &context->lr_bounds );
+ right_centroid_bounds = LRBounds_get_right_centroid( &context->lr_bounds );
+ right_geom_bounds = LRBounds_get_right_geom( &context->lr_bounds );
+ }
+
+ int2 v_is_leaf = is_leaf_2( lr_counts );
+ int2 v_is_dfs = is_dfs_2( lr_counts );
+ int2 v_is_bfs = is_bfs_2( lr_counts );
+ uint left_mask = args.do_mask_processing ? context->lr_mask & 0xff : 0xff;
+ uint right_mask = args.do_mask_processing ? (context->lr_mask & 0xff00) >> 8 : 0xff;
+
+ // how many BVH2 nodes do we need to allocate? For DFS children we pre-allocate the full subtree (2*N-1 nodes); BFS and leaf children get a single node
+ uint2 lr_node_counts = select( (uint2)(1,1), (2*lr_counts-1), v_is_dfs );
+ uint left_node_count = lr_node_counts.x;
+ uint right_node_count = lr_node_counts.y;
+
+ // allocate the nodes
+ uint first_node = BVH2_AllocateNodes( args.bvh2, left_node_count + right_node_count );
+
+ // point our root node at its children
+ left_bvh2_root = first_node;
+ right_bvh2_root = first_node + left_node_count;
+
+ // store combined geom bounds in the root node's AABB.. we previously stored centroid bounds there
+ // but node creation requires geom bounds
+ struct AABB3f geom_bounds = left_geom_bounds;
+ AABB3f_extend(&geom_bounds, &right_geom_bounds);
+ BVH2_WriteInnerNode( args.bvh2, args.bvh2_root, &geom_bounds, (uint2)(left_bvh2_root,right_bvh2_root), left_mask | right_mask );
+
+// printf(" node: %u mask: %x\n", args.bvh2_root, left_mask|right_mask );
+
+ // store the appropriate AABBs in the child nodes
+ // - BFS passes need centroid bounds
+ // - DFS passes need geom bounds
+ // Here we also write leaf connectivity information (prim start+count)
+ // this will be overwritten later if we are creating an inner node
+ struct AABB3f left_box, right_box;
+ left_box = AABB3f_select( left_geom_bounds, left_centroid_bounds, v_is_bfs.xxx );
+ right_box = AABB3f_select( right_geom_bounds, right_centroid_bounds, v_is_bfs.yyy );
+
+ uint left_start = primref_begin;
+ uint right_start = primref_begin + num_left;
+ BVH2_WriteLeafNode( args.bvh2, left_bvh2_root, &left_box, left_start, num_left, left_mask );
+ BVH2_WriteLeafNode( args.bvh2, right_bvh2_root, &right_box, right_start, num_right, right_mask );
+
+ // make input and output primref index buffers consistent in the event we're creating a leaf
+ // There should only ever be one leaf created, otherwise we'd have done a DFS pass sooner
+ if (any( v_is_leaf.xy ))
+ {
+ uint start = v_is_leaf.x ? left_start : right_start;
+ uint num_refs = v_is_leaf.x ? num_left : num_right;
+
+ for(uint i = 0; i < num_refs; i++)
+ {
+ args.primref_index_in[start + i] = args.primref_index_out[start + i];
+ }
+ }
+ }
+
+ // when BFS2 finishes, we need to dispatch two child tasks.
+ // DFS dispatches can run free and do not need a context
+ // BFS dispatches need a context.
+ // In the case where both of the child nodes are BFS, the current context can immediately run one of the child dispatches
+ // and the other is spilled for an unallocated context to pick up
+
+ uint num_dfs_dispatches = is_dfs(num_left) + is_dfs(num_right);
+ if (num_dfs_dispatches)
+ {
+ uint dfs_pos = atomic_add_local(&SLM_dfs_dispatch_count, num_dfs_dispatches);
+ if (is_dfs(num_left))
+ {
+ scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin;
+ scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_left;
+ scheduler->dfs_queue.records[dfs_pos].bvh2_base = left_bvh2_root;
+ scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1;
+ scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index;
+ dfs_pos++;
+ }
+ if (is_dfs(num_right))
+ {
+ scheduler->dfs_queue.records[dfs_pos].primref_base = primref_begin + num_left;
+ scheduler->dfs_queue.records[dfs_pos].num_primrefs = num_right;
+ scheduler->dfs_queue.records[dfs_pos].bvh2_base = right_bvh2_root;
+ scheduler->dfs_queue.records[dfs_pos].tree_depth = depth + 1;
+ scheduler->dfs_queue.records[dfs_pos].batch_index = batch_index;
+ }
+ }
+
+ uint num_bfs_children = is_bfs(num_left) + is_bfs(num_right);
+ if (num_bfs_children)
+ {
+ uint place = atomic_add_local(&SLM_local_spill_stack_size, num_bfs_children);
+ if (is_bfs(num_left))
+ {
+ SLM_local_spill_stack[place].primref_begin = primref_begin;
+ SLM_local_spill_stack[place].primref_end = primref_begin + num_left;
+ SLM_local_spill_stack[place].bvh2_root = left_bvh2_root;
+ SLM_local_spill_stack[place].tree_depth = depth + 1;
+ SLM_local_spill_stack[place].batch_index = batch_index;
+ place++;
+ }
+ if (is_bfs(num_right))
+ {
+ SLM_local_spill_stack[place].primref_begin = primref_begin + num_left;
+ SLM_local_spill_stack[place].primref_end = primref_end;
+ SLM_local_spill_stack[place].bvh2_root = right_bvh2_root;
+ SLM_local_spill_stack[place].tree_depth = depth + 1;
+ SLM_local_spill_stack[place].batch_index = batch_index;
+ place++;
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint local_spill_stack_size = SLM_local_spill_stack_size;
+
+ struct BFS1SpillStackEntry entry;
+ state = VCONTEXT_STATE_UNALLOCATED;
+ if (context_id < local_spill_stack_size)
+ {
+ // pull BFS work from the local spill stack if there's enough work there
+ entry = SLM_local_spill_stack[context_id];
+ state = VCONTEXT_STATE_EXECUTING;
+ }
+ else if ((context_id - local_spill_stack_size) < (global_spill_stack_size))
+ {
+ // if there isn't enough work on the local stack, consume from the global one
+ uint global_pos = (global_spill_stack_size - 1) - (context_id - local_spill_stack_size);
+ entry = scheduler->bfs2_spill_stack.entries[global_pos];
+ state = VCONTEXT_STATE_EXECUTING;
+ }
+
+ // contexts which received work set themselves up for the next BFS1 dispatch
+ uint num_bfs_wgs = 0;
+ uint num_bfs_dispatches = 0;
+ if (state == VCONTEXT_STATE_EXECUTING)
+ {
+ context->dispatch_primref_begin = entry.primref_begin;
+ context->dispatch_primref_end = entry.primref_end;
+ context->bvh2_root = entry.bvh2_root;
+ context->tree_depth = entry.tree_depth;
+ context->batch_index = entry.batch_index;
+
+ context->num_left = 0;
+ context->num_right = 0;
+ context->lr_mask = 0;
+
+ batch_index = entry.batch_index;
+ num_bfs_wgs = get_num_wgs(entry.primref_end - entry.primref_begin, BFS_WG_SIZE);
+ num_bfs_dispatches = 1;
+ }
+
+
+ if (local_spill_stack_size > BFS_NUM_VCONTEXTS)
+ {
+ // write out additional spills if we produced more work than we can consume
+ uint excess_spills = local_spill_stack_size - BFS_NUM_VCONTEXTS;
+ uint write_base = global_spill_stack_size;
+ uint lid = get_local_id(0);
+ if (lid < excess_spills)
+ scheduler->bfs2_spill_stack.entries[write_base + lid] = SLM_local_spill_stack[BFS_NUM_VCONTEXTS + lid];
+
+ if (lid == 0)
+ scheduler->bfs2_spill_stack.size = global_spill_stack_size + excess_spills;
+ }
+ else if (global_spill_stack_size > 0)
+ {
+ // otherwise, if we consumed any spills from the global stack, update the stack size
+ if (get_local_id(0) == 0)
+ {
+ uint global_spills_consumed = min(global_spill_stack_size, BFS_NUM_VCONTEXTS - local_spill_stack_size);
+ scheduler->bfs2_spill_stack.size = global_spill_stack_size - global_spills_consumed;
+ }
+ }
+
+
+ // Do various WG reductions.. the code below is a hand-written version of the following:
+ //
+ // uint bfs_dispatch_queue_pos = work_group_scan_exclusive_add( num_bfs_dispatches );
+ // uint reduce_num_bfs_wgs = work_group_reduce_add(num_bfs_wgs);
+ // uint reduce_num_bfs_dispatches = work_group_reduce_add(num_bfs_dispatches);
+ uint bfs_dispatch_queue_pos;
+ uint reduce_num_bfs_dispatches;
+ uint reduce_num_bfs_wgs;
+ local uint partial_dispatches[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE];
+ local uint partial_wgs[SCHEDULER_WG_SIZE / SCHEDULER_SG_SIZE];
+ {
+ partial_dispatches[get_sub_group_id()] = sub_group_reduce_add(num_bfs_dispatches);
+ partial_wgs[get_sub_group_id()] = sub_group_reduce_add(num_bfs_wgs);
+
+ uint sg_prefix = sub_group_scan_exclusive_add(num_bfs_dispatches);
+
+ uint prefix_dispatches = 0;
+ uint total_dispatches = 0;
+ uint total_wgs = 0;
+ ushort lane = get_sub_group_local_id();
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (ushort i = 0; i < SCHEDULER_NUM_SGS; i += SCHEDULER_SG_SIZE) // this loop is intended to be fully unrolled after compilation
+ {
+ uint p_dispatch = partial_dispatches[i + lane];
+ uint p_wg = partial_wgs[i + lane];
+
+ prefix_dispatches += (i + lane < get_sub_group_id()) ? p_dispatch : 0;
+ total_dispatches += p_dispatch;
+ total_wgs += p_wg;
+ }
+
+ bfs_dispatch_queue_pos = sg_prefix + sub_group_reduce_add(prefix_dispatches);
+ reduce_num_bfs_dispatches = sub_group_reduce_add(total_dispatches);
+ reduce_num_bfs_wgs = sub_group_reduce_add(total_wgs);
+ }
+
+ // insert records into BFS queue
+ if (num_bfs_dispatches)
+ {
+ scheduler->bfs_queue.wg_count[bfs_dispatch_queue_pos] = num_bfs_wgs;
+ scheduler->bfs_queue.records[bfs_dispatch_queue_pos].context_id = context_id;
+ scheduler->bfs_queue.records[bfs_dispatch_queue_pos].batch_index = batch_index;
+ }
+
+
+ // store modified vcontext state if it has changed
+ if (initial_state != state)
+ scheduler->vcontext_state[context_id] = state;
+
+
+ // store workgroup counters
+ if (get_local_id(0) == 0)
+ {
+ scheduler->bfs_queue.num_dispatches = reduce_num_bfs_dispatches;
+ scheduler->num_bfs_wgs = reduce_num_bfs_wgs;
+ scheduler->num_dfs_wgs = SLM_dfs_dispatch_count;
+ }
+
+ const uint contexts_to_clear = min( (uint)BFS_NUM_VCONTEXTS, (uint)(local_spill_stack_size+global_spill_stack_size) );
+
+ for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() )
+ BinInfo_init_subgroup( &scheduler->contexts[i].global_bin_info );
+
+ for ( uint i = get_sub_group_id(); i < contexts_to_clear; i += get_num_sub_groups() )
+ LRBounds_init_subgroup( &scheduler->contexts[i].lr_bounds );
+}
+
+#if 0
+uint record_search( struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue )
+{
+ uint group = get_group_id(0);
+ ushort lane = get_sub_group_local_id();
+ uint num_dispatches = queue->num_dispatches;
+ uint base = 0;
+ for (uint i = 0; i < num_dispatches; i += get_sub_group_size())
+ {
+ uint counts = intel_sub_group_block_read(&queue->wg_count[i]);
+
+ for (uint j = 0; j < get_sub_group_size(); j++)
+ {
+ uint n = sub_group_broadcast(counts, j);
+ if (group < n)
+ {
+ *record_out = queue->records[i + j];
+ return group;
+ }
+ group -= n;
+ }
+ }
+
+ return 0; // NOTE: unreachable in practice
+}
+#endif
+
+
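+// Map this work-group's flat group id onto (dispatch record, group index within that dispatch)
+// by walking the per-dispatch wg_count array with sub-group block reads and exclusive prefix
+// sums until the group id falls inside one dispatch's range.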
+uint record_search(struct BFSDispatchRecord* record_out, global struct BFSDispatchQueue* queue)
+{
+ uint group = get_group_id(0);
+
+ uint num_dispatches = queue->num_dispatches;
+
+ uint dispatch_id = 0;
+ uint local_id = 0;
+ uint i = 0;
+ do
+ {
+ uint counts = intel_sub_group_block_read(&queue->wg_count[i]);
+ uint prefix = sub_group_scan_exclusive_add(counts);
+
+ uint g = group - prefix;
+ uint ballot = intel_sub_group_ballot(g < counts);
+ if (ballot)
+ {
+ uint lane = ctz(ballot);
+ dispatch_id = i + lane;
+ local_id = intel_sub_group_shuffle(g, lane);
+ break;
+ }
+
+ group -= sub_group_broadcast(prefix + counts, get_sub_group_size() - 1);
+
+ i += get_sub_group_size();
+ } while (i < num_dispatches);
+
+
+ *record_out = queue->records[dispatch_id];
+ return local_id;
+}
+
+
+
+
+struct BFSDispatchArgs get_bfs_args(struct BFSDispatchRecord* record, global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals, uint local_group_id)
+{
+ uint context_id = record->context_id;
+ struct VContext* context = &scheduler->contexts[context_id];
+ bool odd_pass = context->tree_depth & 1;
+
+ struct BFSDispatchArgs args;
+ args.scheduler = scheduler;
+ args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, odd_pass );
+ args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, odd_pass );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals );
+ args.wg_primref_begin = context->dispatch_primref_begin + local_group_id * BFS_WG_SIZE;
+ args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, context->dispatch_primref_end );
+ args.dispatch_primref_begin = context->dispatch_primref_begin;
+ args.dispatch_primref_end = context->dispatch_primref_end;
+ args.context_id = context_id;
+ args.context = &scheduler->contexts[context_id];
+ args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE;
+ args.bvh2_root = context->bvh2_root;
+ args.bvh2 = SAHBuildGlobals_GetBVH2( globals );
+ args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals );
+ args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals );
+ return args;
+}
+
+struct BFSDispatchArgs get_bfs_args_queue( global struct BFSDispatchQueue* queue,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals )
+{
+
+ // TODO_OPT: Load this entire prefix array into SLM instead of searching..
+ // Or use sub-group ops
+
+ struct BFSDispatchRecord record;
+ uint local_group_id = record_search(&record, queue);
+
+ return get_bfs_args(&record, scheduler, globals, local_group_id);
+}
+
+
+struct BFSDispatchArgs get_bfs_args_from_record( struct BFSDispatchRecord* record,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals )
+{
+ return get_bfs_args(record, scheduler, globals, 0);
+}
+
+
+struct BFSDispatchArgs get_bfs_args_batchable(
+ global struct BFSDispatchQueue* queue,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer )
+{
+
+ // TODO_OPT: Load this entire prefix array into SLM instead of searching..
+ // Or use sub-group ops
+
+ struct BFSDispatchRecord record;
+ uint local_group_id = record_search(&record, queue);
+
+ global struct SAHBuildGlobals* globals = globals_buffer + record.batch_index;
+
+ return get_bfs_args(&record, scheduler, globals, local_group_id);
+}
+
+
+struct BFSDispatchArgs get_bfs_args_from_record_batchable(
+ struct BFSDispatchRecord* record,
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer )
+{
+ global struct SAHBuildGlobals* globals = globals_buffer + record->batch_index;
+
+ return get_bfs_args(record, scheduler, globals, 0);
+}
+
+struct BFSDispatchArgs get_bfs_args_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals )
+{
+ uint context_id = 0;
+
+ uint num_refs = SAHBuildGlobals_GetTotalPrimRefs( globals );
+
+ struct BFSDispatchArgs args;
+ args.scheduler = scheduler;
+ args.primref_index_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, false );
+ args.primref_index_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, false );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals );
+ args.wg_primref_begin = get_group_id(0) * BFS_WG_SIZE;
+ args.wg_primref_end = min( args.wg_primref_begin + BFS_WG_SIZE, num_refs );
+ args.dispatch_primref_begin = 0;
+ args.dispatch_primref_end = num_refs;
+ args.context_id = context_id;
+ args.context = &scheduler->contexts[context_id];
+ args.num_wgs = ((args.dispatch_primref_end - args.dispatch_primref_begin) + BFS_WG_SIZE - 1) / BFS_WG_SIZE;
+ args.bvh2 = SAHBuildGlobals_GetBVH2( globals );
+ args.bvh2_root = BVH2_GetRoot( args.bvh2 );
+ args.global_num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals );
+ args.do_mask_processing = SAHBuildGlobals_NeedMasks(globals);
+ return args;
+}
+
+
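+// Build the centroid-to-bin mapping for one node: bin = (centroid - ofs) * scale per axis,
+// with scale = 0.99 * num_bins / extent so the largest centroid still maps below num_bins.
+// Axes whose extent is degenerate (<= eps) or extremely large (>= 1e34) get scale 0, so every
+// primref lands in bin 0 and the axis is later treated as unsplittable.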
+inline void BinMapping_init( struct BinMapping* binMapping, struct AABB3f* centBounds, const uint bins )
+{
+ const float4 eps = 1E-34f;
+ const float4 omega = 1E+34f;
+ float3 l = AABB3f_load_lower( centBounds );
+ float3 u = AABB3f_load_upper( centBounds );
+ float4 diag;
+ diag.xyz = max( eps.xyz, u - l );
+ diag.w = 0;
+ float4 scale = (float4)(0.99f * (float)bins) / diag;
+ scale = select( (float4)(0.0f), scale, diag > eps );
+ scale = select( (float4)(0.0f), scale, diag < omega );
+ binMapping->scale = scale;
+ binMapping->ofs.xyz = l.xyz;
+ binMapping->ofs.w = 0;
+}
+
+
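+// Pack each candidate split into a 64-bit key: the SAH cost in the high 32 bits and
+// (bin << 2) | axis in the low bits. SAH costs are non-negative, so their IEEE-754 bit
+// patterns order the same way as the float values, and a plain integer min() across the
+// sub-group selects the cheapest split with a deterministic tie-break on (bin, axis).
+// Zero-scale (degenerate) axes are replaced by defaultSplit, i.e. infinite cost.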
+inline ulong getBestSplit( float3 sah, uint ID, const float4 scale, const ulong defaultSplit )
+{
+ ulong splitX = (((ulong)as_uint( sah.x )) << 32) | ((uint)ID << 2) | 0;
+ ulong splitY = (((ulong)as_uint( sah.y )) << 32) | ((uint)ID << 2) | 1;
+ ulong splitZ = (((ulong)as_uint( sah.z )) << 32) | ((uint)ID << 2) | 2;
+ /* ignore zero sized dimensions */
+ splitX = select( splitX, defaultSplit, (ulong)(scale.x == 0) );
+ splitY = select( splitY, defaultSplit, (ulong)(scale.y == 0) );
+ splitZ = select( splitZ, defaultSplit, (ulong)(scale.z == 0) );
+ ulong bestSplit = min( min( splitX, splitY ), splitZ );
+ bestSplit = sub_group_reduce_min( bestSplit );
+ return bestSplit;
+}
+
+
+
+inline float left_to_right_area16( struct AABB3f* low )
+{
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low );
+ return halfArea_AABB3f( &low_prefix );
+}
+
+inline uint left_to_right_counts16( uint low )
+{
+ return sub_group_scan_exclusive_add( low );
+}
+
+inline float right_to_left_area16( struct AABB3f* low )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ struct AABB3f low_reverse = AABB3f_sub_group_shuffle( low, ID );
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse );
+ const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID );
+ return low_area;
+}
+
+inline uint right_to_left_counts16( uint low )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ const uint low_reverse = intel_sub_group_shuffle( low, ID );
+ const uint low_prefix = sub_group_scan_inclusive_add( low_reverse );
+ return intel_sub_group_shuffle( low_prefix, ID );
+}
+
+inline float2 left_to_right_area32( struct AABB3f* low, struct AABB3f* high )
+{
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max( low );
+ struct AABB3f low_reduce = AABB3f_sub_group_reduce( low );
+ struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max( high );
+ AABB3f_extend( &high_prefix, &low_reduce );
+ const float low_area = halfArea_AABB3f( &low_prefix );
+ const float high_area = halfArea_AABB3f( &high_prefix );
+ return (float2)(low_area, high_area);
+}
+
+inline uint2 left_to_right_counts32( uint low, uint high )
+{
+ const uint low_prefix = sub_group_scan_exclusive_add( low );
+ const uint low_reduce = sub_group_reduce_add( low );
+ const uint high_prefix = sub_group_scan_exclusive_add( high );
+ return (uint2)(low_prefix, low_reduce + high_prefix);
+}
+
+inline float2 right_to_left_area32( struct AABB3f* low, struct AABB3f* high )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ struct AABB3f low_reverse = AABB3f_sub_group_shuffle( high, ID );
+ struct AABB3f high_reverse = AABB3f_sub_group_shuffle( low, ID );
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max( &low_reverse );
+ struct AABB3f low_reduce = AABB3f_sub_group_reduce( &low_reverse );
+ struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max( &high_reverse );
+ AABB3f_extend( &high_prefix, &low_reduce );
+ const float low_area = intel_sub_group_shuffle( halfArea_AABB3f( &high_prefix ), ID );
+ const float high_area = intel_sub_group_shuffle( halfArea_AABB3f( &low_prefix ), ID );
+ return (float2)(low_area, high_area);
+}
+
+inline uint2 right_to_left_counts32( uint low, uint high )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ const uint low_reverse = intel_sub_group_shuffle( high, ID );
+ const uint high_reverse = intel_sub_group_shuffle( low, ID );
+ const uint low_prefix = sub_group_scan_inclusive_add( low_reverse );
+ const uint low_reduce = sub_group_reduce_add( low_reverse );
+ const uint high_prefix = sub_group_scan_inclusive_add( high_reverse ) + low_reduce;
+ return (uint2)(intel_sub_group_shuffle( high_prefix, ID ), intel_sub_group_shuffle( low_prefix, ID ));
+}
+
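+// Integer divide-by-6 without a hardware divide: halve v, then multiply by
+// 0x55555556 == ceil(2^32 / 3) and keep the high 32 bits, which yields
+// floor((v/2)/3) == floor(v/6) exactly for any 32-bit input.
+// Example: fastDivideBy6_uint(37) -> (18 * 0x55555556) >> 32 == 6.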
+inline uint fastDivideBy6_uint( uint v )
+{
+#if 1
+ const ulong u = (ulong)v >> 1;
+ return (uint)((u * 0x55555556ul) >> 32);
+#else
+ return v / 6;
+#endif
+}
+
+inline uint3 fastDivideBy6_uint3( uint3 v )
+{
+ return (uint3)(fastDivideBy6_uint( v.x ), fastDivideBy6_uint( v.y ), fastDivideBy6_uint( v.z ));
+}
+
+#define SAH_LOG_BLOCK_SHIFT 2
+
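+// Evaluate the SAH for every candidate split position (one bin per sub-group lane) on all
+// three axes inside a single 16-wide sub-group: exclusive scans give the left-hand areas and
+// counts, reversed inclusive scans give the right-hand ones, counts are rounded up to
+// ceil(count/6) primitive blocks, and getBestSplit() reduces to the cheapest (axis, bin)
+// pair. Bin 0 is marked invalid because splitting before the first bin would leave one side
+// empty.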
+inline struct BFS_Split BinInfo_reduce( struct BFS_BinInfo* binInfo, const float4 scale )
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ struct AABB3f boundsX = BinInfo_get_AABB( binInfo, subgroupLocalID, 0 );
+
+ const float lr_areaX = left_to_right_area16( &boundsX );
+ const float rl_areaX = right_to_left_area16( &boundsX );
+
+ struct AABB3f boundsY = BinInfo_get_AABB( binInfo, subgroupLocalID, 1 );
+
+ const float lr_areaY = left_to_right_area16( &boundsY );
+ const float rl_areaY = right_to_left_area16( &boundsY );
+
+ struct AABB3f boundsZ = BinInfo_get_AABB( binInfo, subgroupLocalID, 2 );
+
+ const float lr_areaZ = left_to_right_area16( &boundsZ );
+ const float rl_areaZ = right_to_left_area16( &boundsZ );
+
+ const uint3 counts = BinInfo_get_counts( binInfo, subgroupLocalID );
+
+ const uint lr_countsX = left_to_right_counts16( counts.x );
+ const uint rl_countsX = right_to_left_counts16( counts.x );
+ const uint lr_countsY = left_to_right_counts16( counts.y );
+ const uint rl_countsY = right_to_left_counts16( counts.y );
+ const uint lr_countsZ = left_to_right_counts16( counts.z );
+ const uint rl_countsZ = right_to_left_counts16( counts.z );
+
+ const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ);
+ const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ);
+
+ const uint3 lr_count = fastDivideBy6_uint3( (uint3)(lr_countsX, lr_countsY, lr_countsZ) + 6 - 1 );
+ const uint3 rl_count = fastDivideBy6_uint3( (uint3)(rl_countsX, rl_countsY, rl_countsZ) + 6 - 1 );
+ float3 sah = fma( lr_area, convert_float3( lr_count ), rl_area * convert_float3( rl_count ) );
+
+ /* first bin is invalid */
+ sah.x = select( (float)(INFINITY), sah.x, subgroupLocalID != 0 );
+ sah.y = select( (float)(INFINITY), sah.y, subgroupLocalID != 0 );
+ sah.z = select( (float)(INFINITY), sah.z, subgroupLocalID != 0 );
+
+ const ulong defaultSplit = (((ulong)as_uint( (float)(INFINITY) )) << 32);
+
+ const ulong bestSplit = getBestSplit( sah, subgroupLocalID, scale, defaultSplit );
+
+ struct BFS_Split split;
+ split.sah = as_float( (uint)(bestSplit >> 32) );
+ split.dim = (uint)bestSplit & 3;
+ split.pos = (uint)bestSplit >> 2;
+
+ return split;
+}
+
+
+struct BFS_BinInfoReduce3_SLM
+{
+ uint sah[3*BFS_NUM_BINS];
+};
+
+
+
+inline struct BFS_Split BinInfo_reduce3( local struct BFS_BinInfoReduce3_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale )
+{
+ // process each bin/axis combination across sub-groups
+ for (uint i = get_sub_group_id(); i < 3 * BFS_NUM_BINS; i += get_num_sub_groups())
+ {
+ uint my_bin = i % BFS_NUM_BINS;
+ uint my_axis = i / BFS_NUM_BINS;
+
+ float3 left_lower = (float3)(INFINITY,INFINITY,INFINITY);
+ float3 left_upper = -left_lower;
+ float3 right_lower = (float3)(INFINITY,INFINITY,INFINITY);
+ float3 right_upper = -right_lower;
+
+ // load the other bins and assign them to the left or to the right
+ // of this subgroup's bin
+ uint lane = get_sub_group_local_id();
+ struct AABB3f sg_bins = BinInfo_get_AABB(binInfo,lane,my_axis);
+
+ bool is_left = (lane < my_bin);
+ float3 lower = AABB3f_load_lower(&sg_bins);
+ float3 upper = AABB3f_load_upper(&sg_bins);
+
+ float3 lower_l = select_min( lower, is_left );
+ float3 upper_l = select_max( upper, is_left );
+ float3 lower_r = select_min( lower, !is_left );
+ float3 upper_r = select_max( upper, !is_left );
+
+ lower_l = sub_group_reduce_min_float3( lower_l );
+ lower_r = sub_group_reduce_min_float3( lower_r );
+ upper_l = sub_group_reduce_max_float3( upper_l );
+ upper_r = sub_group_reduce_max_float3( upper_r );
+ float3 dl = upper_l - lower_l;
+ float3 dr = upper_r - lower_r;
+ float area_l = dl.x* (dl.y + dl.z) + (dl.y * dl.z);
+ float area_r = dr.x* (dr.y + dr.z) + (dr.y * dr.z);
+
+ // get the counts
+ uint sg_bin_count = BinInfo_get_count(binInfo, lane, my_axis);
+ uint count_l = (is_left) ? sg_bin_count : 0;
+ uint count_r = (is_left) ? 0 : sg_bin_count;
+ count_l = sub_group_reduce_add(count_l);
+ count_r = sub_group_reduce_add(count_r);
+
+ // compute sah
+ count_l = fastDivideBy6_uint(count_l + 6 - 1);
+ count_r = fastDivideBy6_uint(count_r + 6 - 1);
+ float lr_partial = area_l * count_l;
+ float rl_partial = area_r * count_r;
+ float sah = lr_partial + rl_partial;
+
+ // first bin is invalid
+ sah = select((float)(INFINITY), sah, my_bin != 0);
+
+ // ignore zero sized dimensions
+ sah = select( sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) );
+ sah = select( sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) );
+ sah = select( sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) );
+
+ // tuck the axis into the bottom bits of sah cost.
+ // The result is an integer between 0 and +inf (7F800000)
+ // If we have 3 axes with infinite sah cost, we will select axis 0
+ slm->sah[i] = (as_uint(sah)&~0x3) | my_axis;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // reduce split candidates down to one subgroup
+ // sah is strictly positive, so integer compares can be used
+ // which results in a faster sub_group_reduce_min()
+ //
+ uint best_sah = 0xffffffff;
+
+ uint lid = get_sub_group_local_id();
+ if (lid < BFS_NUM_BINS)
+ {
+ best_sah = slm->sah[lid];
+ lid += BFS_NUM_BINS;
+ best_sah = min( best_sah, slm->sah[lid] );
+ lid += BFS_NUM_BINS;
+ best_sah = min( best_sah, slm->sah[lid] );
+ }
+
+ uint reduced_bestsah = sub_group_reduce_min( best_sah );
+ uint best_bin = ctz(intel_sub_group_ballot(best_sah == reduced_bestsah));
+ uint best_axis = as_uint(reduced_bestsah) & 0x3;
+
+ struct BFS_Split ret;
+ ret.sah = as_float(reduced_bestsah);
+ ret.dim = best_axis;
+ ret.pos = best_bin;
+ return ret;
+}
+
+
+struct BFS_BinInfoReduce_SLM
+{
+ struct
+ {
+ float sah;
+ uint bin;
+ } axisInfo[3];
+};
+
+
+
+inline struct BFS_Split BinInfo_reduce2( local struct BFS_BinInfoReduce_SLM* slm, struct BFS_BinInfo* binInfo, const float4 scale, uint num_primrefs)
+{
+ ushort my_axis = get_sub_group_id();
+ ushort my_bin = get_sub_group_local_id();
+
+ if (my_axis < 3)
+ {
+ struct AABB3f aabb = BinInfo_get_AABB(binInfo, my_bin, my_axis);
+ uint count = BinInfo_get_count(binInfo, my_bin, my_axis);
+
+ float lr_area = left_to_right_area16(&aabb);
+ float rl_area = right_to_left_area16(&aabb);
+
+ uint lr_count = sub_group_scan_exclusive_add(count);
+ uint rl_count = num_primrefs - lr_count;
+
+ lr_count = fastDivideBy6_uint(lr_count + 6 - 1);
+ rl_count = fastDivideBy6_uint(rl_count + 6 - 1);
+ float lr_partial = lr_area * lr_count;
+ float rl_partial = rl_area * rl_count;
+ float sah = lr_partial + rl_partial;
+
+ // first bin is invalid
+ sah = select((float)(INFINITY), sah, my_bin != 0);
+
+ float best_sah = sub_group_reduce_min( sah );
+ uint best_bin = ctz(intel_sub_group_ballot(sah == best_sah));
+
+ // ignore zero sized dimensions
+ best_sah = select( best_sah, (float)(INFINITY), (scale.x == 0 && my_axis == 0) );
+ best_sah = select( best_sah, (float)(INFINITY), (scale.y == 0 && my_axis == 1) );
+ best_sah = select( best_sah, (float)(INFINITY), (scale.z == 0 && my_axis == 2) );
+
+ if (get_sub_group_local_id() == 0)
+ {
+ slm->axisInfo[my_axis].sah = best_sah;
+ slm->axisInfo[my_axis].bin = best_bin;
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ float sah = (float)(INFINITY);
+ if( get_sub_group_local_id() < 3 )
+ sah = slm->axisInfo[get_sub_group_local_id()].sah;
+
+ float bestsah = min(sub_group_broadcast(sah, 0), min(sub_group_broadcast(sah, 1), sub_group_broadcast(sah, 2)));
+ uint bestAxis = ctz( intel_sub_group_ballot(bestsah == sah) );
+
+ struct BFS_Split split;
+ split.sah = bestsah;
+ split.dim = bestAxis;
+ split.pos = slm->axisInfo[bestAxis].bin;
+ return split;
+}
+
+
+inline bool is_left( struct BinMapping* binMapping, struct BFS_Split* split, struct AABB* primref )
+{
+ const uint dim = split->dim;
+ const float lower = primref->lower[dim];
+ const float upper = primref->upper[dim];
+ const float c = lower + upper;
+ const uint pos = convert_uint_rtz( (c - binMapping->ofs[dim]) * binMapping->scale[dim] );
+ return pos < split->pos;
+}
+
+struct BFS_Pass1_SLM
+{
+ struct BFS_BinInfo bin_info;
+// struct BFS_BinInfoReduce3_SLM reduce3;
+};
+
+
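+// BFS pass 1: each work-group bins its slice of primrefs. The node's current AABB holds the
+// centroid bounds, which define the bin mapping; every thread adds its primref to a bin-info
+// table in SLM, and the local table is then merged into the context's global_bin_info.
+// Split selection happens in pass 2, once all work-groups of the dispatch have contributed.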
+void DO_BFS_pass1( local struct BFS_Pass1_SLM* slm,
+ uint thread_primref_id,
+ bool thread_primref_valid,
+ struct BFSDispatchArgs args
+ )
+{
+ local struct BFS_BinInfo* local_bin_info = &slm->bin_info;
+ global struct VContext* context = args.context;
+ struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root ); // root AABB is initialized to centroid bounds
+
+ struct BinMapping bin_mapping;
+ BinMapping_init( &bin_mapping, &centroid_bounds, BFS_NUM_BINS );
+
+ // fetch this thread's primref
+ PrimRef ref;
+ if ( thread_primref_valid )
+ ref = args.primref_buffer[thread_primref_id];
+
+ // init bin info
+ BinInfo_init( local_bin_info );
+
+ // fence on local bin-info init
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // merge this thread's primref into local bin info
+ BinInfo_add_primref( &bin_mapping, local_bin_info, &ref, thread_primref_valid );
+
+ // fence on local bin-info update
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ BinInfo_merge(&context->global_bin_info, local_bin_info);
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size(BFS_WG_SIZE,1,1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass1_indexed(
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS_Pass1_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals );
+
+ bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end;
+ uint thread_primref_id = 0;
+ if ( thread_primref_valid )
+ thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )];
+
+ DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args );
+}
+
+
+__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) )
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass1_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS_Pass1_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals );
+
+ uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 );
+ bool thread_primref_valid = thread_primref_id < args.wg_primref_end;
+
+ DO_BFS_pass1( &slm, thread_primref_id, thread_primref_valid, args );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass1_indexed_batchable(
+ global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer )
+{
+ local struct BFS_Pass1_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer );
+
+ bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end;
+ uint thread_primref_id = 0;
+ if (thread_primref_valid)
+ thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)];
+
+ DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args);
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass1_initial_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer )
+{
+ local struct BFS_Pass1_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_batchable( &scheduler->bfs_queue, scheduler, globals_buffer );
+
+ uint thread_primref_id = args.wg_primref_begin + get_local_id(0);
+ bool thread_primref_valid = thread_primref_id < args.wg_primref_end;
+
+ DO_BFS_pass1(&slm, thread_primref_id, thread_primref_valid, args);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+///
+/// BVH2 construction -- BFS Phase Pass2
+///
+/////////////////////////////////////////////////////////////////////////////////////////////////
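+// BFS pass 2: every work-group re-derives the best split from the bins accumulated in pass 1,
+// classifies each of its primrefs as left or right of the split, accumulates left/right
+// centroid and geometry bounds (and, optionally, instance masks) in SLM, reserves output
+// slots in the shared primref index buffer with global atomics (the left side grows forward
+// from the dispatch begin, the right side grows backward from the dispatch end), and scatters
+// the primref ids to their new positions. The scheduler kernel then consumes num_left,
+// num_right and the merged bounds to finalize the node.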
+
+struct BFS_Pass2_SLM
+{
+ struct BFS_BinInfoReduce3_SLM reduce3;
+ //struct AABB3f left_centroid_bounds;
+ //struct AABB3f right_centroid_bounds;
+ //struct AABB3f left_geom_bounds;
+ //struct AABB3f right_geom_bounds;
+ LRBounds lr_bounds;
+ uint left_count;
+ uint right_count;
+ uint lr_mask;
+ uint left_primref_base;
+ uint right_primref_base;
+// uint num_wgs;
+
+// uint output_indices[BFS_WG_SIZE];
+};
+
+
+
+
+
+
+
+void DO_BFS_pass2(
+ local struct BFS_Pass2_SLM* slm,
+ uint thread_primref_id,
+ bool thread_primref_valid,
+ struct BFSDispatchArgs args
+)
+{
+ global struct VContext* context = args.context;
+
+ struct AABB3f centroid_bounds = BVH2_GetNodeBox( args.bvh2, args.bvh2_root );
+
+ // load the thread's primref
+ PrimRef ref;
+ if ( thread_primref_valid )
+ ref = args.primref_buffer[thread_primref_id];
+
+ struct BinMapping bin_mapping;
+ BinMapping_init( &bin_mapping, &centroid_bounds, BFS_NUM_BINS );
+
+ // initialize working SLM space
+ LRBounds_init(&slm->lr_bounds);
+ if(get_local_id(0) == 0)
+ {
+ slm->left_count = 0;
+ slm->right_count = 0;
+
+ if( args.do_mask_processing )
+ slm->lr_mask = 0;
+ }
+
+ // compute split - every workgroup does the same computation
+ // local barrier inside BinInfo_reduce3
+ struct BFS_Split split = BinInfo_reduce3( &slm->reduce3, &context->global_bin_info,bin_mapping.scale );
+
+ uint wg_prim_count = args.wg_primref_end - args.wg_primref_begin;
+
+ // partition primrefs into L/R subsets...
+ bool go_left = false;
+ if (split.sah == (float)(INFINITY)) // no valid split, split in the middle.. This can happen due to floating-point limit cases in huge scenes
+ go_left = get_local_id(0) < (wg_prim_count / 2);
+ else
+ go_left = is_left( &bin_mapping, &split, &ref );
+
+ // assign this primref a position in the output array, and expand corresponding centroid-bounds
+ uint local_index;
+ {
+ float3 centroid = ref.lower.xyz + ref.upper.xyz;
+
+ uint l_ballot = intel_sub_group_ballot( go_left && thread_primref_valid );
+ uint r_ballot = intel_sub_group_ballot( !go_left && thread_primref_valid );
+ if (l_ballot)
+ {
+ bool active_lane = l_ballot & (1 << get_sub_group_local_id());
+ float3 Cmin, Cmax, Gmin, Gmax;
+ Cmin = select_min( centroid.xyz, active_lane );
+ Cmax = select_max( centroid.xyz, active_lane );
+ Gmin = select_min( ref.lower.xyz, active_lane );
+ Gmax = select_max( ref.upper.xyz, active_lane );
+
+ Cmin = sub_group_reduce_min_float3( Cmin );
+ Cmax = sub_group_reduce_max_float3( Cmax );
+ Gmin = sub_group_reduce_min_float3( Gmin );
+ Gmax = sub_group_reduce_max_float3( Gmax );
+
+ LRBounds_merge_left( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax );
+ }
+
+ if (r_ballot)
+ {
+ bool active_lane = r_ballot & (1 << get_sub_group_local_id());
+ float3 Cmin, Cmax, Gmin, Gmax;
+ Cmin = select_min(centroid.xyz, active_lane);
+ Cmax = select_max(centroid.xyz, active_lane);
+ Gmin = select_min(ref.lower.xyz, active_lane);
+ Gmax = select_max(ref.upper.xyz, active_lane);
+
+ Cmin = sub_group_reduce_min_float3(Cmin);
+ Cmax = sub_group_reduce_max_float3(Cmax);
+ Gmin = sub_group_reduce_min_float3(Gmin);
+ Gmax = sub_group_reduce_max_float3(Gmax);
+
+ LRBounds_merge_right( &slm->lr_bounds, Cmin,Cmax,Gmin,Gmax );
+ }
+
+ if( args.do_mask_processing )
+ {
+ uint mask =0;
+ if (thread_primref_valid)
+ {
+ mask = PRIMREF_instanceMask(&ref) ;
+ mask = go_left ? mask : mask<<8;
+ }
+
+ // TODO OPT: there is no 'sub_group_reduce_or' and IGC does not do the reduction trick
+ // for atomics on sub-group uniform addresses
+ for( uint i= get_sub_group_size()/2; i>0; i/= 2)
+ mask = mask | intel_sub_group_shuffle_down(mask,mask,i);
+ if( get_sub_group_local_id() == 0 )
+ atomic_or_local( &slm->lr_mask, mask );
+ }
+
+ uint l_base = 0;
+ uint r_base = 0;
+ if( get_sub_group_local_id() == 0 && l_ballot )
+ l_base = atomic_add_local( &slm->left_count, popcount(l_ballot) );
+ if( get_sub_group_local_id() == 0 && r_ballot )
+ r_base = atomic_add_local( &slm->right_count, popcount(r_ballot) );
+
+ sub_group_barrier( CLK_LOCAL_MEM_FENCE );
+ l_base = sub_group_broadcast(l_base,0);
+ r_base = sub_group_broadcast(r_base,0);
+
+ l_base = l_base + subgroup_bit_prefix_exclusive( l_ballot );
+ r_base = r_base + subgroup_bit_prefix_exclusive( r_ballot );
+
+ local_index = (go_left) ? l_base : r_base;
+ }
+
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // merge local into global
+ // TODO_OPT: Look at spreading some of this across subgroups
+ if ( get_sub_group_id() == 0 )
+ {
+ // allocate primref output space for this wg; also merge the local instance masks if mask processing is enabled
+ uint num_left = slm->left_count;
+ {
+ if (num_left && get_sub_group_local_id() == 0)
+ {
+ num_left = atomic_add_global( &context->num_left, num_left );
+ slm->left_primref_base = args.dispatch_primref_begin + num_left;
+ }
+ }
+ uint num_right = slm->right_count;
+ {
+ if (num_right && get_sub_group_local_id() == 0)
+ {
+ num_right = atomic_add_global( &context->num_right, num_right );
+ slm->right_primref_base = (args.dispatch_primref_end - 1) - num_right;
+ }
+ }
+
+ if( args.do_mask_processing && get_sub_group_local_id() == 0 )
+ atomic_or_global( &context->lr_mask, slm->lr_mask );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ LRBounds_merge( &context->lr_bounds, &slm->lr_bounds );
+
+ // move thread's primref ID into correct position in output index buffer
+ if (thread_primref_valid)
+ {
+ uint pos = go_left ? slm->left_primref_base + local_index
+ : slm->right_primref_base - local_index;
+
+ args.primref_index_out[pos] = thread_primref_id;
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+BFS_pass2_indexed( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS_Pass2_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_queue( &scheduler->bfs_queue, scheduler, sah_globals );
+
+ bool thread_primref_valid = (args.wg_primref_begin + get_local_id( 0 )) < args.wg_primref_end;
+ uint thread_primref_id = 0;
+ if ( thread_primref_valid )
+ thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id( 0 )];
+
+ DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( BFS_WG_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+BFS_pass2_initial( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* sah_globals )
+{
+ local struct BFS_Pass2_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_initial( scheduler, sah_globals );
+
+ uint thread_primref_id = args.wg_primref_begin + get_local_id( 0 );
+ bool thread_primref_valid = thread_primref_id < args.wg_primref_end;
+
+ DO_BFS_pass2( &slm, thread_primref_id, thread_primref_valid, args );
+}
+
+
+__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass2_indexed_batchable( global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer )
+{
+ local struct BFS_Pass2_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer );
+
+ bool thread_primref_valid = (args.wg_primref_begin + get_local_id(0)) < args.wg_primref_end;
+ uint thread_primref_id = 0;
+ if (thread_primref_valid)
+ thread_primref_id = args.primref_index_in[args.wg_primref_begin + get_local_id(0)];
+
+ DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args);
+
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+BFS_pass2_initial_batchable(global struct VContextScheduler* scheduler, global struct SAHBuildGlobals* globals_buffer)
+{
+ local struct BFS_Pass2_SLM slm;
+ struct BFSDispatchArgs args = get_bfs_args_batchable(&scheduler->bfs_queue, scheduler, globals_buffer );
+
+ uint thread_primref_id = args.wg_primref_begin + get_local_id(0);
+ bool thread_primref_valid = thread_primref_id < args.wg_primref_end;
+
+ DO_BFS_pass2(&slm, thread_primref_id, thread_primref_valid, args);
+}
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+///
+/// BVH2 construction -- DFS Phase
+///
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct DFSArgs
+{
+ uint primref_base;
+ uint global_bvh2_base;
+ bool do_mask_processing;
+ ushort num_primrefs;
+ global uint* primref_indices_in;
+ global uint* primref_indices_out;
+ global PrimRef* primref_buffer;
+ global struct BVH2* global_bvh2;
+};
+
+
+struct DFSPrimRefAABB
+{
+ half lower[3];
+ half upper[3];
+};
+
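+// Primref AABBs are stored in half precision, normalized to the enclosing subtree's box
+// (see PrimRefSet_SetPrimRef_FullPrecision), with lower rounded down and upper rounded up
+// so that the quantized box conservatively contains the original.  Initialization produces
+// an inverted (empty) box (lower > upper) so that the first extend() simply overwrites it.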
+void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb )
+{
+ bb->lower[0] = 1;
+ bb->lower[1] = 1;
+ bb->lower[2] = 1;
+ bb->upper[0] = 0;
+ bb->upper[1] = 0;
+ bb->upper[2] = 0;
+}
+
+void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v )
+{
+ aabb->lower[0] = min( aabb->lower[0], v->lower[0] );
+ aabb->lower[1] = min( aabb->lower[1], v->lower[1] );
+ aabb->lower[2] = min( aabb->lower[2], v->lower[2] );
+ aabb->upper[0] = max( aabb->upper[0], v->upper[0] );
+ aabb->upper[1] = max( aabb->upper[1], v->upper[1] );
+ aabb->upper[2] = max( aabb->upper[2], v->upper[2] );
+}
+
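+// half the surface area of the box: d.x*(d.y+d.z) + d.y*d.z == dx*dy + dx*dz + dy*dz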
+half DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb )
+{
+ const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]);
+ return fma( d.x, (d.y + d.z), d.y * d.z );
+}
+
+struct DFSPrimRef
+{
+ struct DFSPrimRefAABB aabb;
+ ushort2 meta;
+};
+
+void DFSPrimRef_SetBVH2Root( struct DFSPrimRef* ref, ushort root )
+{
+ ref->meta.y = root;
+}
+
+uint DFSPrimRef_GetInputIndex( struct DFSPrimRef* ref )
+{
+ return ref->meta.x;
+}
+
+uint DFSPrimRef_GetBVH2Parent( struct DFSPrimRef* ref )
+{
+ return ref->meta.y;
+}
+
+
+struct PrimRefSet
+{
+ struct DFSPrimRefAABB AABB[DFS_WG_SIZE];
+ ushort2 meta[DFS_WG_SIZE];
+ uint input_indices[DFS_WG_SIZE];
+};
+
+
+
+
+local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id )
+{
+ return &refs->AABB[id];
+}
+struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id )
+{
+ struct DFSPrimRef r;
+ r.aabb = refs->AABB[id];
+ r.meta = refs->meta[id];
+ return r;
+}
+void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id )
+{
+ refs->AABB[id] = ref.aabb;
+ refs->meta[id] = ref.meta;
+}
+
+void PrimRefSet_SetPrimRef_FullPrecision( struct AABB3f* root_aabb, local struct PrimRefSet* refs, PrimRef ref, ushort id )
+{
+ float3 root_l = AABB3f_load_lower( root_aabb );
+ float3 root_u = AABB3f_load_upper( root_aabb );
+ float3 d = root_u - root_l;
+ float scale = 1.0f / max( d.x, max( d.y, d.z ) );
+
+ float3 l = ref.lower.xyz;
+ float3 u = ref.upper.xyz;
+ half3 lh = convert_half3_rtz( (l - root_l) * scale );
+ half3 uh = convert_half3_rtp( (u - root_l) * scale );
+
+ refs->AABB[id].lower[0] = lh.x;
+ refs->AABB[id].lower[1] = lh.y;
+ refs->AABB[id].lower[2] = lh.z;
+ refs->AABB[id].upper[0] = uh.x;
+ refs->AABB[id].upper[1] = uh.y;
+ refs->AABB[id].upper[2] = uh.z;
+ refs->meta[id].x = id;
+ refs->meta[id].y = 0;
+}
+
+
+
+void DFS_CreatePrimRefSet( struct DFSArgs args,
+ local struct PrimRefSet* prim_refs )
+{
+ ushort id = get_local_id( 0 );
+ ushort num_primrefs = args.num_primrefs;
+
+ struct AABB3f box = BVH2_GetNodeBox( args.global_bvh2, args.global_bvh2_base );
+ if ( id < num_primrefs )
+ {
+ PrimRef ref = args.primref_buffer[args.primref_indices_in[id]];
+ prim_refs->input_indices[id] = args.primref_indices_in[id];
+ PrimRefSet_SetPrimRef_FullPrecision( &box, prim_refs, ref, id );
+ }
+}
+
+struct ThreadRangeInfo
+{
+ uchar start;
+ uchar local_num_prims;
+ uchar bvh2_root;
+ bool active;
+};
+
+struct BVHBuildLocals // size: ~3.8K
+{
+ uchar2 axis_and_left_count[ DFS_WG_SIZE ];
+ struct ThreadRangeInfo range[ DFS_WG_SIZE ];
+ uint sah[ DFS_WG_SIZE ];
+};
+
+#define LOCAL_BVH2_NODE_COUNT (2*(DFS_WG_SIZE) -1)
+
+struct LocalBVH2
+{
+ uint nodes[LOCAL_BVH2_NODE_COUNT];
+ uint num_nodes;
+
+    // bit layout for a node (MSB to LSB) is:
+    //   uchar child_ptr;     // this is right_child_index >> 1. right child's lsb is always 0
+ // uchar primref_base; // index of the node's first primref. will be 0 at the root
+ // uchar parent_dist; // distance in nodes from this node to its parent
+ // uchar prim_counter; // number of prims in this subtree. For a complete tree (256 prims), the root may be off by 1
+
+ // for a WG size of 256, 8b is enough for parent distance, because the tree is built in level order
+ // the maximum distance between parent and child occurs for a complete tree.
+ // in this scenario the left-most leaf has index 255, its parent has index 127, the deltas to the children are 128 and 129
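+    //
+    // example: a node word of 0x05030207 decodes as child_ptr=5 (right child at index 10, left child at 9),
+    //   primref_base=3, parent_dist=2 (parent index = this index - 2), and a prim count of 7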
+};
+
+
+void LocalBVH2_Initialize( struct LocalBVH2* bvh2, ushort num_prims )
+{
+ bvh2->num_nodes = 1;
+ bvh2->nodes[0] = min(num_prims,(ushort)255);
+}
+
+
+
+void LocalBVH2_Initialize_Presplit(struct LocalBVH2* bvh2, ushort num_prims, ushort left_count, ushort right_count )
+{
+ bvh2->num_nodes = 3;
+ bvh2->nodes[0] = min(num_prims, (ushort)255);
+
+ ushort bvh2_root = 0;
+ ushort child_place = 1;
+
+ uint child_ptr = (child_place + 1) >> 1;
+ bvh2->nodes[bvh2_root] |= (child_ptr) << 24;
+
+ uint parent_dist = child_place - bvh2_root;
+
+ // initialize child nodes
+ ushort primref_base_left = 0;
+ ushort primref_base_right = left_count;
+ uint left = (primref_base_left << 16) + ((parent_dist << 8)) + left_count;
+ uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8) + right_count;
+ bvh2->nodes[child_place] = left;
+ bvh2->nodes[child_place + 1] = right;
+}
+
+
+void LocalBVH2_CreateInnerNode( local struct LocalBVH2* bvh2, ushort bvh2_root, uint primref_base_left, uint primref_base_right )
+{
+ ushort child_place = atomic_add_local( &(bvh2-> num_nodes), 2 );
+
+ uint child_ptr = (child_place + 1) >> 1;
+ bvh2->nodes[bvh2_root] |= (child_ptr) << 24;
+
+ uint parent_dist = child_place - bvh2_root;
+
+ // initialize child nodes
+ uint left = (primref_base_left << 16) + ((parent_dist << 8));
+ uint right = (primref_base_right << 16) + ((parent_dist + 1) << 8);
+ bvh2->nodes[child_place] = left;
+ bvh2->nodes[child_place + 1] = right;
+}
+
+ushort2 LocalBVH2_GetChildIndices( struct LocalBVH2* bvh2, ushort bvh2_root )
+{
+ ushort right_idx = (bvh2->nodes[bvh2_root] & 0xff000000) >> 23;
+ return (ushort2)(right_idx - 1, right_idx);
+}
+
+
+ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* bvh2, ushort bvh2_root )
+{
+ // increment only the lower 8 bits. Algorithm will not overflow by design
+ return atomic_inc_local( &bvh2->nodes[bvh2_root] ) & 0xff;
+}
+
+ushort LocalBVH2_SetLeafPrimCount(local struct LocalBVH2* bvh2, ushort bvh2_root, ushort count)
+{
+    return bvh2->nodes[bvh2_root] |= (count & 0xff);
+}
+
+bool LocalBVH2_IsRoot( struct LocalBVH2* bvh2, ushort node_id )
+{
+ return node_id == 0;
+}
+
+ushort LocalBVH2_GetLeafPrimrefStart( struct LocalBVH2* bvh2, ushort bvh2_node_id )
+{
+ return (bvh2->nodes[bvh2_node_id] >> 16) & 255;
+}
+
+bool LocalBVH2_IsLeftChild( struct LocalBVH2* bvh2, ushort parent_node, ushort current_node )
+{
+ return (current_node & 1); // nodes are allocated in pairs. first node is root, left child is an odd index
+}
+
+ushort LocalBVH2_GetParent( struct LocalBVH2* bvh2, ushort node )
+{
+ return node - ((bvh2->nodes[node] >> 8) & 255);
+}
+
+uint LocalBVH2_GetNodeCount( struct LocalBVH2* bvh2 )
+{
+ return bvh2->num_nodes;
+}
+
+bool LocalBVH2_IsLeaf( struct LocalBVH2* bvh2, ushort node_index )
+{
+ return (bvh2->nodes[node_index] & 255) <= TREE_ARITY;
+}
+
+ushort LocalBVH2_GetLeafPrimCount( struct LocalBVH2* bvh2, ushort node_index )
+{
+ return (bvh2->nodes[node_index] & 255);
+}
+
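+// Cooperative top-down sweep-SAH build in SLM.  Each thread owns one primref.  Every iteration,
+// each active thread proposes a split plane at its own primref, sweeps the primrefs of its subtree
+// to evaluate the SAH cost per axis, and the best candidate per subtree is selected with an
+// atomic_min on a packed key.  Primrefs are then partitioned into the two child subtrees, and
+// threads whose subtree has reached TREE_ARITY primitives or fewer retire (that subtree becomes
+// a leaf).  The loop ends once no active threads remain.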
+void DFS_ConstructBVH2( local struct LocalBVH2* bvh2,
+ local struct PrimRefSet* prim_refs,
+ ushort bvh2_root,
+ ushort prim_range_start,
+ ushort local_num_prims,
+ ushort global_num_prims,
+ local struct BVHBuildLocals* locals,
+ local uint* num_active_threads )
+{
+ ushort tid = get_local_id( 0 );
+ ushort primref_position = tid;
+
+ bool active_thread = tid < global_num_prims;
+
+ // Handle cases where initial binner creates leaves
+ if ( active_thread && local_num_prims <= TREE_ARITY )
+ {
+ struct DFSPrimRef ref = PrimRefSet_GetPrimRef(prim_refs, primref_position);
+ DFSPrimRef_SetBVH2Root(&ref, bvh2_root);
+ PrimRefSet_SetPrimRef(prim_refs, ref, primref_position);
+ active_thread = false;
+ if (primref_position == prim_range_start)
+ atomic_sub_local(num_active_threads, local_num_prims);
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ locals->range[ tid ].start = prim_range_start;
+ locals->range[ tid ].local_num_prims = local_num_prims;
+ locals->range[ tid ].bvh2_root = bvh2_root;
+ locals->range[ tid ].active = active_thread;
+
+ do
+ {
+ if(active_thread && prim_range_start == primref_position)
+ locals->sah[primref_position] = UINT_MAX;
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( active_thread )
+ {
+ local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position );
+
+ // each thread evaluates a possible split candidate. Scan primrefs and compute sah cost
+ // do this axis-by-axis to keep register pressure low
+ float best_sah = INFINITY;
+ ushort best_axis = 3;
+ ushort best_count = 0;
+
+ struct DFSPrimRefAABB box_left[3];
+ struct DFSPrimRefAABB box_right[3];
+ float CSplit[3];
+ ushort count_left[3];
+
+ for ( ushort axis = 0; axis < 3; axis++ )
+ {
+ DFSPrimRefAABB_init( &box_left[axis] );
+ DFSPrimRefAABB_init( &box_right[axis] );
+
+ CSplit[axis] = my_box->lower[axis] + my_box->upper[axis];
+ count_left[axis] = 0;
+ }
+
+ // scan primrefs in our subtree and partition using this thread's prim as a split plane
+ {
+ struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start );
+
+ for ( ushort p = 1; p < local_num_prims; p++ )
+ {
+ struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration
+
+ for( ushort axis = 0; axis < 3; axis++ )
+ {
+ float c = box.lower[axis] + box.upper[axis];
+
+ if ( c < CSplit[axis] )
+ {
+ // this primitive is to our left.
+ DFSPrimRefAABB_extend( &box_left[axis], &box );
+ count_left[axis]++;
+ }
+ else
+ {
+ // this primitive is to our right
+ DFSPrimRefAABB_extend( &box_right[axis], &box );
+ }
+ }
+
+ box = next_box;
+ }
+
+ // last iteration without preloading box
+ for( ushort axis = 0; axis < 3; axis++ )
+ {
+ float c = box.lower[axis] + box.upper[axis];
+
+ if ( c < CSplit[axis] )
+ {
+ // this primitive is to our left.
+ DFSPrimRefAABB_extend( &box_left[axis], &box );
+ count_left[axis]++;
+ }
+ else
+ {
+ // this primitive is to our right
+ DFSPrimRefAABB_extend( &box_right[axis], &box );
+ }
+ }
+
+ }
+
+ for ( ushort axis = 0; axis < 3; axis++ )
+ {
+ float Al = DFSPrimRefAABB_halfArea( &box_left[axis] );
+ float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] );
+
+                // Avoid NaNs in the SAH calculation in the corner case where all prims go right.
+                // In this case we set Al=Ar, because such a split will only be selected if all primrefs
+                // are coincident. In that case, we will fall back to split-in-the-middle and both subtrees
+                // should store the same quantized area value
+ if ( count_left[axis] == 0 )
+ Al = Ar;
+
+ // compute sah cost
+ ushort count_right = local_num_prims - count_left[axis];
+ float sah = Ar * count_right + Al * count_left[axis];
+
+ // keep this split if it is better than the previous one, or if the previous one was a corner-case
+ if ( sah < best_sah || best_count == 0 )
+ {
+ // yes, keep it
+ best_axis = axis;
+ best_sah = sah;
+ best_count = count_left[axis];
+ }
+ }
+
+ // write split information to SLM
+ locals->axis_and_left_count[primref_position].x = best_axis;
+ locals->axis_and_left_count[primref_position].y = best_count;
+ uint sah = as_uint(best_sah);
+            // break ties by axis and thread id to ensure deterministic split selection;
+            // otherwise the builder can produce a different tree structure from run to run
+            // based on the ordering of primitives (which can vary due to non-determinism in atomic counters).
+            // Embed the split axis and candidate index into the low bits of the sah value: the atomic_min
+            // below then selects the lowest sah, breaking ties by largest axis, then by lowest thread id.
+ sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | tid );
+
+ // reduce on split candidates in our local subtree and decide the best one
+ atomic_min_local( &locals->sah[ prim_range_start ], sah);
+ }
+
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ ushort split_index = locals->sah[ prim_range_start ] & 255;
+ ushort split_axis = locals->axis_and_left_count[split_index].x;
+ ushort split_left_count = locals->axis_and_left_count[split_index].y;
+
+ if ( (primref_position == split_index) && active_thread )
+ {
+ // first thread in a given subtree creates the inner node
+ ushort start_left = prim_range_start;
+ ushort start_right = prim_range_start + split_left_count;
+ if ( split_left_count == 0 )
+ start_right = start_left + (local_num_prims / 2); // handle split-in-the-middle case
+
+ LocalBVH2_CreateInnerNode( bvh2, bvh2_root, start_left, start_right );
+ }
+
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ struct DFSPrimRef ref;
+ ushort new_primref_position;
+
+ if ( active_thread )
+ {
+ ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root );
+ bool go_left;
+
+ if ( split_left_count == 0 )
+ {
+ // We chose a split with no left-side prims
+ // This will only happen if all primrefs are located in the exact same position
+ // In that case, fall back to split-in-the-middle
+ split_left_count = (local_num_prims / 2);
+ go_left = (primref_position - prim_range_start < split_left_count);
+ }
+ else
+ {
+ // determine what side of the split this thread's primref belongs on
+ local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position );
+ local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index );
+ float c = my_box->lower[split_axis] + my_box->upper[split_axis];
+ float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis];
+ go_left = c < Csplit;
+ }
+
+ // adjust state variables for next loop iteration
+ bvh2_root = (go_left) ? kids.x : kids.y;
+ local_num_prims = (go_left) ? split_left_count : (local_num_prims - split_left_count);
+ prim_range_start = (go_left) ? prim_range_start : prim_range_start + split_left_count;
+
+ // determine the new primref position by incrementing a counter in the destination subtree
+ new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root );
+
+ // load our primref from its previous position
+ ref = PrimRefSet_GetPrimRef( prim_refs, primref_position );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( active_thread )
+ {
+ // write our primref into its sorted position and note which node it went in
+ DFSPrimRef_SetBVH2Root( &ref, bvh2_root );
+ PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position );
+ primref_position = new_primref_position;
+
+
+ // deactivate all threads whose subtrees are small enough to form a leaf
+ if ( local_num_prims <= TREE_ARITY )
+ {
+ active_thread = false;
+ if( primref_position == prim_range_start )
+ atomic_sub_local( num_active_threads, local_num_prims );
+ }
+
+ locals->range[ primref_position ].start = prim_range_start;
+ locals->range[ primref_position ].local_num_prims = local_num_prims;
+ locals->range[ primref_position ].bvh2_root = bvh2_root;
+ locals->range[ primref_position ].active = active_thread;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // if we'll have next iteration then load from SLM
+ if(*num_active_threads)
+ {
+ prim_range_start = locals->range[ tid ].start;
+ local_num_prims = locals->range[ tid ].local_num_prims;
+ bvh2_root = locals->range[ tid ].bvh2_root;
+ active_thread = locals->range[ tid ].active;
+ primref_position = tid;
+ }
+ else
+ {
+ break;
+ }
+
+ } while ( true );
+
+}
+
+
+#define REFIT_BIT_DWORDS (((LOCAL_BVH2_NODE_COUNT) - (DFS_WG_SIZE))/32)
+
+struct RefitBits
+{
+ uint bits[REFIT_BIT_DWORDS];
+};
+
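+// The unions below overlay SLM that is live in different phases of Do_DFS: u1 is the binning
+// scratch during the initial binning pass and the LocalBVH2 afterwards, while u2 holds the
+// primref set and build locals during BVH2 construction and is reused for the full-precision
+// node boxes during refit.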
+struct DFS_SLM
+{
+ union
+ {
+ struct LocalBVH2 bvh2;
+ struct {
+ struct AABB3f centroid_bounds;
+ uint left_count;
+ uint right_count;
+ struct BFS_BinInfo bins;
+ struct BFS_BinInfoReduce3_SLM reduce3;
+ } binning;
+
+ } u1;
+
+ union
+ {
+ struct {
+ struct PrimRefSet prim_refs;
+ struct BVHBuildLocals locals;
+ } pass0;
+
+ struct AABB3f node_boxes[LOCAL_BVH2_NODE_COUNT];
+
+ } u2;
+
+ union
+ {
+ uchar bytes[DFS_WG_SIZE];
+ uint dwords[DFS_WG_SIZE/4];
+ } mask_info;
+
+ struct RefitBits refit_bits;
+
+};
+
+
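+// Single binned-SAH split over all of the work-group's primrefs, used when the input is large:
+// it pre-partitions the refs into a left and a right block so that the per-thread sweep in
+// DFS_ConstructBVH2 starts from two smaller subtrees instead of one large one.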
+void DFS_InitialBinningPass(
+ local struct BFS_BinInfo* bins,
+ local struct BFS_BinInfoReduce3_SLM* reduce3,
+ uniform local struct AABB3f* centroid_bounds,
+ local struct PrimRefSet* refs,
+ local uint* left_counter,
+ local uint* right_counter,
+ ushort num_refs )
+{
+ uint tid = get_local_id(0);
+
+ // initialize SLM structures
+ if (tid == 0)
+ {
+ AABB3f_init(centroid_bounds);
+ *left_counter = 0;
+ *right_counter = 0;
+ }
+
+ BinInfo_init(bins);
+
+ PrimRef ref;
+ struct DFSPrimRef dfs_ref;
+
+ if (tid < num_refs)
+ {
+ dfs_ref = PrimRefSet_GetPrimRef(refs, tid);
+ struct DFSPrimRefAABB box = dfs_ref.aabb;
+ ref.lower.xyz = (float3)(box.lower[0], box.lower[1], box.lower[2]);
+ ref.upper.xyz = (float3)(box.upper[0], box.upper[1], box.upper[2]);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // compute centroid bounds so that we can bin
+ if (tid < num_refs)
+ {
+ float3 centroid = ref.lower.xyz + ref.upper.xyz;
+ Uniform_AABB3f_atomic_merge_local_sub_group_lu(centroid_bounds, centroid, centroid);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // add primrefs to bins
+ struct BinMapping mapping;
+ BinMapping_init(&mapping, centroid_bounds, BFS_NUM_BINS);
+
+ BinInfo_add_primref( &mapping, bins, &ref, tid<num_refs );
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // compute split - every sub_group computes different bin
+ struct BFS_Split split = BinInfo_reduce3(reduce3, bins, mapping.scale);
+
+
+ bool go_left = false;
+ uint local_pos = 0;
+ if (tid < num_refs)
+ {
+ // partition primrefs into L/R subsets...
+        if (split.sah == (float)(INFINITY)) // no valid split; split in the middle. This can happen due to floating-point limit cases in huge scenes
+ go_left = tid < (num_refs / 2);
+ else
+ go_left = is_left(&mapping, &split, &ref);
+
+ if (go_left)
+ local_pos = atomic_inc_local(left_counter);
+ else
+ local_pos = num_refs - (1+ atomic_inc_local(right_counter));
+
+ PrimRefSet_SetPrimRef(refs, dfs_ref, local_pos);
+ }
+
+}
+
+
+void Do_DFS( struct DFSArgs args, local struct DFS_SLM* slm, local uint* num_active_threads )
+{
+ local struct LocalBVH2* bvh2 = &slm->u1.bvh2;
+
+ global struct BVH2* global_bvh2 = args.global_bvh2;
+
+ PrimRef ref;
+ uint parent_node;
+
+ {
+ local struct BVHBuildLocals* locals = &slm->u2.pass0.locals;
+ local struct PrimRefSet* prim_refs = &slm->u2.pass0.prim_refs;
+
+ DFS_CreatePrimRefSet(args, prim_refs);
+
+ uint local_id = get_local_id(0);
+
+ ushort bvh2_root = 0;
+ ushort prim_range_start = 0;
+ ushort local_num_prims = args.num_primrefs;
+
+ if(local_id == 0)
+ *num_active_threads = local_num_prims;
+
+ // barrier for DFS_CreatePrimRefSet and num_active_threads
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // initial binning pass if number of primrefs is large
+ if( args.num_primrefs > 32 )
+ {
+ DFS_InitialBinningPass(&slm->u1.binning.bins, &slm->u1.binning.reduce3, &slm->u1.binning.centroid_bounds, prim_refs,
+ &slm->u1.binning.left_count, &slm->u1.binning.right_count, args.num_primrefs);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ ushort left_count = slm->u1.binning.left_count;
+ ushort right_count = args.num_primrefs - left_count;
+ if (get_local_id(0) == 0)
+ LocalBVH2_Initialize_Presplit(bvh2, args.num_primrefs, left_count, right_count);
+
+ bvh2_root = (local_id < left_count) ? 1 : 2;
+ local_num_prims = (local_id < left_count) ? left_count : right_count;
+ prim_range_start = (local_id < left_count) ? 0 : left_count;
+ }
+ else
+ {
+ if (get_local_id(0) == 0)
+ LocalBVH2_Initialize(bvh2, args.num_primrefs);
+ }
+
+ DFS_ConstructBVH2( bvh2, prim_refs, bvh2_root, prim_range_start, local_num_prims, args.num_primrefs, locals, num_active_threads);
+
+ // move the prim refs into their sorted position
+ // keep this thread's primref around for later use
+ if ( local_id < args.num_primrefs )
+ {
+ struct DFSPrimRef dfs_ref = PrimRefSet_GetPrimRef( prim_refs, local_id );
+
+ uint input_id = DFSPrimRef_GetInputIndex( &dfs_ref );
+
+ parent_node = DFSPrimRef_GetBVH2Parent( &dfs_ref );
+
+ uint primref_index = prim_refs->input_indices[input_id];
+ ref = args.primref_buffer[primref_index];
+ args.primref_indices_out[local_id] = primref_index;
+ args.primref_indices_in[local_id] = primref_index;
+ // these buffers are not read again until the end of kernel
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ }
+
+
+ // initialize flags for determining when subtrees are done refit
+ if ( get_local_id( 0 ) < REFIT_BIT_DWORDS )
+ slm->refit_bits.bits[get_local_id( 0 )] = 0;
+
+
+ // stash full-precision primref AABBs in slm storage
+ local struct AABB3f* slm_boxes = &slm->u2.node_boxes[0];
+ bool active_thread = get_local_id( 0 ) < args.num_primrefs;
+ if( active_thread )
+ {
+ AABB3f_set( &slm_boxes[get_local_id( 0 )], ref.lower.xyz, ref.upper.xyz );
+
+ // stash instance masks in SLM storage
+ if( args.do_mask_processing )
+ slm->mask_info.bytes[get_local_id(0)] = PRIMREF_instanceMask( &ref );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
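+    // Bottom-up refit: one surviving thread per leaf writes the leaf node, then for every inner
+    // node exactly one thread (the second of its two children to arrive, tracked with the xor'd
+    // refit bits) merges the child boxes/masks and writes the node out to the global BVH2,
+    // walking upward until it reaches the local root.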
+ // Refit leaf nodes
+ uint box_index;
+ if ( active_thread )
+ {
+ // the thread for the first primref in every leaf is the one that will ascend
+ // remaining threads merge their AABB/mask into the first one and terminate
+ uint first_ref = LocalBVH2_GetLeafPrimrefStart( bvh2, parent_node );
+ if ( first_ref != get_local_id( 0 ) )
+ {
+ AABB3f_atomic_merge_local_lu( &slm_boxes[first_ref], ref.lower.xyz, ref.upper.xyz );
+
+ if( args.do_mask_processing )
+ {
+ uint dword_index = first_ref/4;
+ uint shift = (first_ref%4)*8;
+ uint mask = PRIMREF_instanceMask(&ref) << shift;
+ atomic_or_local( &slm->mask_info.dwords[dword_index], mask );
+ }
+ active_thread = false; // switch off all primref threads except the first one
+ }
+
+ box_index = first_ref;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( active_thread )
+ {
+ uint current_node = parent_node;
+ parent_node = LocalBVH2_GetParent( bvh2, current_node );
+
+ // write out the leaf node's AABB
+ uint num_prims = LocalBVH2_GetLeafPrimCount( bvh2, current_node );
+ uint prim_offs = args.primref_base + LocalBVH2_GetLeafPrimrefStart( bvh2, current_node );
+
+ uint mask = 0xff;
+ if( args.do_mask_processing )
+ mask = slm->mask_info.bytes[box_index];
+
+ BVH2_WriteLeafNode( global_bvh2, args.global_bvh2_base + current_node, &slm_boxes[box_index], prim_offs, num_prims, mask );
+
+ // we no longer need the BVH2 bits for this node, so re-purpose the memory to store the AABB index
+ bvh2->nodes[current_node] = box_index;
+
+ // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed
+ uint thread_mask = (1 << (parent_node % 32));
+ if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], thread_mask ) & thread_mask) == 0 )
+ active_thread = false;
+ }
+
+ // count how many active threads in sub_group we have and increment wg's number of active threads
+ uint sg_active = sub_group_reduce_add(active_thread ? 1 : 0);
+ if(get_sub_group_local_id() == 0)
+ {
+ atomic_add_local(num_active_threads, sg_active);
+ }
+
+ // refit internal nodes:
+ // walk up the tree and refit AABBs
+
+ do
+ {
+        barrier( CLK_LOCAL_MEM_FENCE ); // this barrier makes sure all threads have read num_active_threads before anyone modifies it
+ if ( active_thread )
+ {
+ uint current_node = parent_node;
+ parent_node = LocalBVH2_GetParent( bvh2, current_node );
+
+ // pull left/right box indices from current node
+ ushort2 kids = LocalBVH2_GetChildIndices( bvh2, current_node );
+
+ uint left_box = bvh2->nodes[kids.x];
+ uint right_box = bvh2->nodes[kids.y];
+
+ struct AABB3f left = slm_boxes[left_box];
+ struct AABB3f right = slm_boxes[right_box];
+ AABB3f_extend( &left, &right );
+
+ uint2 child_offsets = (uint2)(
+ args.global_bvh2_base + kids.x,
+ args.global_bvh2_base + kids.y);
+
+ uint mask = 0xff;
+ if( args.do_mask_processing )
+ {
+ mask = slm->mask_info.bytes[left_box]
+ | slm->mask_info.bytes[right_box];
+ slm->mask_info.bytes[left_box] = mask;
+ }
+
+ BVH2_WriteInnerNode( args.global_bvh2, args.global_bvh2_base+current_node, &left, child_offsets, mask );
+
+ slm_boxes[left_box] = left;
+ bvh2->nodes[current_node] = left_box;
+
+ // stop at the root
+ if ( LocalBVH2_IsRoot( bvh2, current_node ) )
+ {
+ active_thread = false;
+ atomic_dec_local(num_active_threads);
+ }
+ else
+ {
+ // toggle flag bit in parent node. The second thread to flip the bit is the one that gets to proceed
+ uint mask = (1 << (parent_node % 32));
+ if ( (atomic_xor_local( &slm->refit_bits.bits[parent_node / 32], mask ) & mask) == 0 )
+ {
+ active_thread = false;
+ atomic_dec_local(num_active_threads);
+ }
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ } while ( *num_active_threads > 0 );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size(DFS_WG_SIZE,1,1) ))
+__attribute__( (intel_reqd_sub_group_size(16)) )
+kernel void
+DFS( global struct VContextScheduler* scheduler,
+ global struct SAHBuildGlobals* globals_buffer )
+{
+ local struct DFS_SLM slm;
+ local struct DFSDispatchRecord record;
+ local uint num_active_threads;
+
+ if ( get_local_id( 0 ) == 0 )
+ {
+ // pop an entry off the DFS dispatch queue
+ //uint wg_index = atomic_dec_global( &scheduler->num_dfs_wgs ) - 1;
+ //record = scheduler->dfs_queue.records[wg_index];
+
+ // TODO: The version above races, but is considerably faster... investigate
+ uint wg_index = get_group_id(0);
+ record = scheduler->dfs_queue.records[wg_index];
+ write_mem_fence( CLK_LOCAL_MEM_FENCE );
+ atomic_dec_global( &scheduler->num_dfs_wgs );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+
+ bool odd_pass = record.tree_depth & 1;
+
+ global struct SAHBuildGlobals* sah_globals = globals_buffer + record.batch_index;
+
+ struct DFSArgs args;
+ args.num_primrefs = record.num_primrefs;
+ args.primref_indices_in = SAHBuildGlobals_GetPrimrefIndices_In( sah_globals, odd_pass );
+ args.primref_indices_out = SAHBuildGlobals_GetPrimrefIndices_Out( sah_globals, odd_pass );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( sah_globals );
+ args.global_bvh2 = SAHBuildGlobals_GetBVH2( sah_globals );
+ args.primref_indices_in += record.primref_base;
+ args.primref_indices_out += record.primref_base;
+ args.primref_base = record.primref_base;
+ args.global_bvh2_base = record.bvh2_base;
+ args.do_mask_processing = SAHBuildGlobals_NeedMasks( sah_globals );
+
+ Do_DFS( args, &slm, &num_active_threads );
+
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+///
+/// BVH2 to BVH6
+///
+/////////////////////////////////////////////////////////////////////////////////////////////////
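+// Flattening phase: each subgroup (or each SIMD8 half of a SIMD16 subgroup in the 2xSIMD8 variants)
+// converts one BVH2 node into one QBVH6 node.  Inner BVH2 nodes are greedily collapsed into
+// up-to-TREE_ARITY-wide internal nodes; BVH2 leaves sort their primrefs by area and write out the
+// leaf node plus the reordered primref indices.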
+
+
+
+struct BuildFlatTreeArgs
+{
+ ushort leaf_size_in_bytes;
+ ushort leaf_type;
+ ushort inner_node_type;
+ bool do_mask_processing;
+
+ global uint* primref_indices;
+ global PrimRef* primref_buffer;
+ global struct Globals* globals;
+ global struct BVHBase* bvh_base;
+ global struct BVH2* bvh2;
+};
+
+
+// lane i in the return value is the index of the ith largest primref in the input
+// the return value can be used with shuffle() to move data into its sorted position
+// the elements of 'key' must be unique; only the first 6 elements are sorted
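+//
+// example (first 6 lanes): key = {40,10,30,60,20,50}  ->  num_larger = {2,5,3,0,4,1}
+//   -> returned indices = {3,5,0,2,4,1}, and shuffle(key, result) yields {60,50,40,30,20,10}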
+varying ushort SUBGROUP_get_sort_indices_N6( varying uint key )
+{
+ // each lane computes the number of items larger than it
+ // this is its position in the descending order
+    // TODO_OPT: the compiler can vectorize these 16-bit adds by packing them into the lower and upper halves of the same GPR; verify that it does.
+    //   If the compiler does not generate optimal code, consider moving this to Cm.
+
+ varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0;
+ varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0;
+ varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0;
+ varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0;
+ varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0;
+ varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0;
+ varying ushort a = cmp0 + cmp2 + cmp4;
+ varying ushort b = cmp1 + cmp3 + cmp5;
+ varying ushort num_larger = a + b;
+
+ // each lane determines which of the input elements it should pull
+ varying ushort lane = get_sub_group_local_id();
+ a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0;
+ b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0;
+ a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0;
+ b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0;
+ a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0;
+ b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0;
+ return a + b;
+}
+
+uint SUBGROUP_area_to_sort_key( varying float area, uniform ushort num_children )
+{
+ varying ushort lane = get_sub_group_local_id();
+ area = (lane < num_children) ? area : 0; // put inactive nodes last
+
+ // drop LSBs and break ties by lane number to ensure unique keys
+ // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal.
+ // If we do not do this it can lead to non-deterministic tree structure
+ return (as_uint(area) & 0xffffff80) + (lane^(get_sub_group_size()-1));
+}
+
+// lane i in the return value is the index of the ith largest primref in the input
+// the return value can be used with shuffle() to move data into its sorted position
+// the elements of 'key' must be unique; only the first 6 elements are sorted
+varying ushort SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16( varying uint key )
+{
+ // each lane computes the number of items larger than it
+ // this is its position in the descending order
+    // TODO_OPT: the compiler can vectorize these 16-bit adds by packing them into the lower and upper halves of the same GPR; verify that it does.
+    //   If the compiler does not generate optimal code, consider moving this to Cm.
+
+ varying ushort cmp0 = (sub_group_broadcast(key, 0) > key) ? 1 : 0;
+ varying ushort cmp1 = (sub_group_broadcast(key, 1) > key) ? 1 : 0;
+ varying ushort cmp2 = (sub_group_broadcast(key, 2) > key) ? 1 : 0;
+ varying ushort cmp3 = (sub_group_broadcast(key, 3) > key) ? 1 : 0;
+ varying ushort cmp4 = (sub_group_broadcast(key, 4) > key) ? 1 : 0;
+ varying ushort cmp5 = (sub_group_broadcast(key, 5) > key) ? 1 : 0;
+ varying ushort a = cmp0 + cmp2 + cmp4;
+ varying ushort b = cmp1 + cmp3 + cmp5;
+ varying ushort num_larger = a + b;
+
+ varying ushort cmp0_1 = (sub_group_broadcast(key, 8) > key) ? 1 : 0;
+ varying ushort cmp1_1 = (sub_group_broadcast(key, 9) > key) ? 1 : 0;
+ varying ushort cmp2_1 = (sub_group_broadcast(key, 10) > key) ? 1 : 0;
+ varying ushort cmp3_1 = (sub_group_broadcast(key, 11) > key) ? 1 : 0;
+ varying ushort cmp4_1 = (sub_group_broadcast(key, 12) > key) ? 1 : 0;
+ varying ushort cmp5_1 = (sub_group_broadcast(key, 13) > key) ? 1 : 0;
+ varying ushort a_1 = cmp0_1 + cmp2_1 + cmp4_1;
+ varying ushort b_1 = cmp1_1 + cmp3_1 + cmp5_1;
+ varying ushort num_larger_1 = a_1 + b_1;
+
+ // each lane determines which of the input elements it should pull
+ varying ushort lane = get_sub_group_local_id();
+ if(lane < 8)
+ {
+ a = (sub_group_broadcast(num_larger, 0) == lane) ? 0 : 0;
+ b = (sub_group_broadcast(num_larger, 1) == lane) ? 1 : 0;
+ a += (sub_group_broadcast(num_larger, 2) == lane) ? 2 : 0;
+ b += (sub_group_broadcast(num_larger, 3) == lane) ? 3 : 0;
+ a += (sub_group_broadcast(num_larger, 4) == lane) ? 4 : 0;
+ b += (sub_group_broadcast(num_larger, 5) == lane) ? 5 : 0;
+ }
+ else
+ {
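+        // the '? 8 : 8' below is intentional: lanes 8..15 always pull from the upper half,
+        // so the base index is 8 regardless (mirroring the '? 0 : 0' base in the lower half)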
+ a = (sub_group_broadcast(num_larger_1, 8) == lane-8) ? 8 : 8;
+ b = (sub_group_broadcast(num_larger_1, 9) == lane-8) ? 1 : 0;
+ a += (sub_group_broadcast(num_larger_1, 10) == lane-8) ? 2 : 0;
+ b += (sub_group_broadcast(num_larger_1, 11) == lane-8) ? 3 : 0;
+ a += (sub_group_broadcast(num_larger_1, 12) == lane-8) ? 4 : 0;
+ b += (sub_group_broadcast(num_larger_1, 13) == lane-8) ? 5 : 0;
+ }
+
+ return a + b;
+}
+
+uint SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16( varying float area, uniform ushort num_children )
+{
+ varying ushort lane = get_sub_group_local_id() % 8;
+ area = (lane < num_children) ? area : 0; // put inactive nodes last
+
+ // drop LSBs and break ties by lane number to ensure unique keys
+ // use descending lane IDs to ensure that sort is stable if the upper MSBs are equal.
+ // If we do not do this it can lead to non-deterministic tree structure
+ return (as_uint(area) & 0xffffff80) + (lane^7);
+}
+
+ushort SUBGROUP_BuildFlatTreeNode(
+ uniform struct BuildFlatTreeArgs args,
+ uniform uint bvh2_root,
+ uniform struct InternalNode* qnode,
+ uniform uint qnode_index,
+ varying uint3* sg_children_out // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z)
+ // if a leaf is created, receives number of primrefs (z)
+) // return value is the number of child nodes or 0 for a leaf
+{
+ global struct BVH2* bvh2 = args.bvh2;
+ varying ushort lane = get_sub_group_local_id();
+
+ global struct BVHBase* base = args.bvh_base;
+
+
+ if ( !BVH2_IsInnerNode( bvh2, bvh2_root ) )
+ {
+ uniform ushort num_prims = BVH2_GetLeafPrimCount( bvh2, bvh2_root );
+ uniform uint primref_start = BVH2_GetLeafPrimStart( bvh2, bvh2_root );
+ varying uint primref_index = primref_start + ((lane < num_prims) ? lane : 0);
+
+ varying uint ref_id = args.primref_indices[primref_index];
+ varying PrimRef ref = args.primref_buffer[ref_id];
+ uniform char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base );
+ uniform char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes;
+
+ uniform int offset = (int)(leaf_mem - (char*)qnode);
+ offset = offset >> 6;
+
+ varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&ref), num_prims );
+ varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key);
+ ref = PrimRef_sub_group_shuffle(&ref, sort_index);
+ ref_id = intel_sub_group_shuffle(ref_id, sort_index);
+
+ if (lane < num_prims)
+ args.primref_indices[primref_index] = ref_id;
+
+ uint global_num_prims = args.globals->numPrimitives;
+ char* bvh_mem = (char*) args.bvh_base;
+
+ if(lane < num_prims)
+ args.primref_indices[primref_index + global_num_prims] = qnode - (struct InternalNode*)bvh_mem;
+
+ if (args.leaf_type == NODE_TYPE_INSTANCE)
+ subgroup_setInstanceQBVHNodeN( offset, &ref, num_prims, (struct QBVHNodeN*)qnode, lane < num_prims ? PRIMREF_instanceMask(&ref) : 0 );
+ else
+ subgroup_setQBVHNodeN( offset, args.leaf_type, &ref, num_prims, (struct QBVHNodeN*)qnode, BVH_NODE_DEFAULT_MASK );
+
+ sg_children_out->z = num_prims;
+ return 0;
+ }
+ else
+ {
+ // collapse BVH2 into BVH6.
+ // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough
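+        // Greedy collapse: repeatedly replace the child with the largest surface area (inner nodes only)
+        // by its two BVH2 children, until we reach TREE_ARITY children or only leaves remain.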
+ uniform ushort num_children = 2;
+
+ uniform uint2 kids = BVH2_GetChildIndices( bvh2, bvh2_root );
+ varying uint sg_bvh2_node = kids.x;
+ if ( lane == 1 )
+ sg_bvh2_node = kids.y;
+
+ do
+ {
+ // choose the inner node with maximum area to replace.
+ // Its left child goes in its old location. Its right child goes in a new lane
+
+ // TODO_OPT: We re-read the AABBs again and again to compute area
+ // ... store per-lane boxes instead and pre-compute areas
+
+ varying float sg_area = BVH2_GetNodeArea( bvh2, sg_bvh2_node );
+ varying bool sg_is_inner = BVH2_IsInnerNode( bvh2, sg_bvh2_node );
+ sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf
+
+ uniform float max_area = sub_group_reduce_max_N6( sg_area );
+ varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner;
+ uniform uint mask = intel_sub_group_ballot( sg_reducable );
+
+ // TODO_OPT: Some of these ops seem redundant.. look at trimming further
+
+ if ( mask == 0 )
+ break;
+
+ // choose the inner node with maximum area to replace
+ uniform ushort victim_child = ctz( mask );
+ uniform uint victim_node = sub_group_broadcast( sg_bvh2_node, victim_child );
+ kids = BVH2_GetChildIndices( bvh2, victim_node );
+
+ if ( lane == victim_child )
+ sg_bvh2_node = kids.x;
+ else if ( lane == num_children )
+ sg_bvh2_node = kids.y;
+
+ num_children++;
+
+ } while ( num_children < TREE_ARITY );
+
+ // allocate inner node space
+ uniform uint kids_offset;
+ if (get_sub_group_local_id() == 0)
+ kids_offset = allocate_inner_nodes( args.bvh_base, num_children );
+ kids_offset = sub_group_broadcast(kids_offset, 0);
+
+ uniform struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset);
+ uniform int offset = (int)((char*)kid - (char*)qnode) >> 6;
+
+#if 0
+ uniform uint kids_offset;
+ if ( get_sub_group_local_id() == 0 )
+ kids_offset = alloc_node_mem( args.globals, sizeof( struct QBVHNodeN ) * num_children );
+ kids_offset = sub_group_broadcast( kids_offset, 0 );
+
+
+ // create inner node
+ uniform struct QBVHNodeN* kid = (struct QBVHNodeN*) ((char*)(args.bvh_base) + kids_offset);
+ uniform int offset = (int)((char*)kid - (char*)qnode) >> 6;
+#endif
+ uniform uint child_type = args.inner_node_type;
+
+ // sort child nodes in descending order by AABB area
+ varying struct AABB box = BVH2_GetAABB( bvh2, sg_bvh2_node );
+ varying uint key = SUBGROUP_area_to_sort_key(AABB_halfArea(&box), num_children );
+ varying ushort sort_index = SUBGROUP_get_sort_indices_N6(key);
+ box = AABB_sub_group_shuffle(&box, sort_index);
+ sg_bvh2_node = intel_sub_group_shuffle(sg_bvh2_node, sort_index);
+
+ uniform uint node_mask = (args.do_mask_processing) ? BVH2_GetMask( bvh2, bvh2_root ) : 0xff;
+
+ subgroup_setQBVHNodeN( offset, child_type, &box, num_children, (struct QBVHNodeN*)qnode, node_mask );
+
+ // return child information
+ *sg_children_out = (uint3)(sg_bvh2_node, qnode_index + offset + get_sub_group_local_id(), num_children );
+ return num_children;
+ }
+}
+
+ushort SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(
+ uniform struct BuildFlatTreeArgs args,
+ varying uint bvh2_root,
+ varying struct InternalNode* qnode_base,
+ varying uint qnode_index,
+ varying uint3* sg_children_out, // if an inner node is created, receives the indices of the 6 child nodes (X), and the QNode position (y), and num_children(z)
+ // if a leaf is created, receives number of primrefs (z)
+ bool active_lane
+) // return value is the number of child nodes or 0 for a leaf
+{
+ global struct BVH2* bvh2 = args.bvh2;
+ varying ushort SIMD16_lane = get_sub_group_local_id();
+ varying ushort SIMD8_lane = get_sub_group_local_id() % 8;
+ varying ushort SIMD8_id = get_sub_group_local_id() / 8;
+ varying ushort lane = get_sub_group_local_id();
+ global struct BVHBase* base = args.bvh_base;
+
+ struct BVH2NodeMetaData nodeMetaData = BVH2_GetNodeMetaData( bvh2, bvh2_root );
+
+ bool is_leaf = active_lane && !BVH2NodeMetaData_IsInnerNode( &nodeMetaData );
+ bool is_inner = active_lane && BVH2NodeMetaData_IsInnerNode( &nodeMetaData );
+
+ uchar mask = BVH_NODE_DEFAULT_MASK;
+ if(is_inner)
+ mask = (args.do_mask_processing) ? BVH2NodeMetaData_GetMask( &nodeMetaData ) : 0xff;
+
+ int offset;
+
+ varying struct InternalNode* qnode = qnode_base + qnode_index;
+    // TODO: the unions are not strictly needed; they are kept only for readability
+ union {
+ uint num_prims;
+ uint num_children;
+ } lane_num_data;
+
+ union {
+ PrimRef ref; // this is in fact AABB
+ struct AABB box;
+ } lane_box_data;
+
+ union {
+ uint ref_id;
+ uint sg_bvh2_node;
+ } lane_id_data;
+
+ // for leafs
+ varying uint primref_index;
+
+ if(is_leaf)
+ {
+ lane_num_data.num_prims = BVH2NodeMetaData_GetLeafPrimCount( &nodeMetaData );
+ uint primref_start = BVH2NodeMetaData_GetLeafPrimStart( &nodeMetaData );
+ primref_index = primref_start + ((SIMD8_lane < lane_num_data.num_prims) ? SIMD8_lane : 0);
+
+ lane_id_data.ref_id = args.primref_indices[primref_index];
+ lane_box_data.ref = args.primref_buffer[lane_id_data.ref_id];
+ char* leaf_mem_base = (char*)BVHBase_GetQuadLeaves( args.bvh_base );
+ char* leaf_mem = leaf_mem_base + primref_start * args.leaf_size_in_bytes;
+
+ offset = (int)(leaf_mem - (char*)qnode);
+ offset = offset >> 6;
+ }
+
+
+ if(intel_sub_group_ballot(is_inner))
+ {
+ // collapse BVH2 into BVH6.
+ // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough
+
+ uint2 kids;
+ if(is_inner)
+ {
+ lane_num_data.num_children = 2;
+ kids = BVH2_GetChildIndices( bvh2, bvh2_root );
+
+ lane_id_data.sg_bvh2_node = kids.x;
+ if ( SIMD8_lane == 1 )
+ lane_id_data.sg_bvh2_node = kids.y;
+ }
+
+ bool active = is_inner;
+ do
+ {
+ // choose the inner node with maximum area to replace.
+ // Its left child goes in its old location. Its right child goes in a new lane
+
+ // TODO_OPT: We re-read the AABBs again and again to compute area
+ // ... store per-lane boxes instead and pre-compute areas
+
+ varying float sg_area = 0;
+ varying bool sg_is_inner = false;
+ if(active)
+ {
+ sg_area = BVH2_GetNodeArea( bvh2, lane_id_data.sg_bvh2_node );
+ sg_is_inner = BVH2_IsInnerNode( bvh2, lane_id_data.sg_bvh2_node );
+ sg_area = (sg_is_inner && SIMD8_lane < lane_num_data.num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf
+ }
+
+ float max_area = sub_group_reduce_max_N6_2xSIMD8_in_SIMD16( sg_area );
+ varying bool sg_reducable = max_area == sg_area && sg_is_inner && (SIMD8_lane < lane_num_data.num_children);
+            uint mask = intel_sub_group_ballot( sg_reducable ) & (0xFF << SIMD8_id * 8); // masking the ballot gives each SIMD8 half of the SIMD16 its own set of candidate bits
+
+ // TODO_OPT: Some of these ops seem redundant.. look at trimming further
+
+ if ( mask == 0 )
+ active = false;
+
+ // choose the inner node with maximum area to replace
+ ushort victim_child = ctz( mask );
+ uint victim_node = intel_sub_group_shuffle( lane_id_data.sg_bvh2_node, victim_child );
+ if(active)
+ {
+ kids = BVH2_GetChildIndices( bvh2, victim_node );
+
+                if ( SIMD16_lane == victim_child ) // use SIMD16_lane because victim_child was computed from the SIMD16 ballot, i.e. the second half's victim lies in lanes 8..13
+ lane_id_data.sg_bvh2_node = kids.x;
+ else if ( SIMD8_lane == lane_num_data.num_children )
+ lane_id_data.sg_bvh2_node = kids.y;
+
+ lane_num_data.num_children++;
+
+ if(lane_num_data.num_children >= TREE_ARITY)
+ active = false;
+ }
+
+ } while ( intel_sub_group_ballot(active) ); // if any active, then continue
+
+ // sum children from both halfs of SIMD16 to allocate nodes only once per sub_group
+ uniform ushort num_children = is_inner ? lane_num_data.num_children : 0;
+ uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0);
+ uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8);
+
+ num_children = first_SIMD8_num_children + second_SIMD8_num_children;
+ uint kids_offset;
+
+ // allocate inner node space
+ if(num_children && SIMD16_lane == 0)
+ kids_offset = allocate_inner_nodes( args.bvh_base, num_children );
+ kids_offset = sub_group_broadcast(kids_offset, 0);
+ if((is_inner))
+ {
+ kids_offset += SIMD8_id * first_SIMD8_num_children;
+
+ struct QBVHNodeN* kid = (((struct QBVHNodeN*)args.bvh_base) + kids_offset);
+
+ offset = (int)((char*)kid - (char*)qnode) >> 6;
+ lane_box_data.box = BVH2_GetAABB( bvh2, lane_id_data.sg_bvh2_node );
+ }
+ }
+
+ // sort child nodes in descending order by AABB area
+ varying uint key = SUBGROUP_area_to_sort_key_2xSIMD8_in_SIMD16(AABB_halfArea(&lane_box_data.box), lane_num_data.num_children );
+ varying ushort sort_index = SUBGROUP_get_sort_indices_N6_2xSIMD8_in_SIMD16(key);
+ lane_box_data.box = PrimRef_sub_group_shuffle(&lane_box_data.box, sort_index);
+ lane_id_data.sg_bvh2_node = intel_sub_group_shuffle(lane_id_data.sg_bvh2_node, sort_index);
+
+ char* bvh_mem = (char*) args.bvh_base;
+ if (is_leaf && SIMD8_lane < lane_num_data.num_prims)
+ {
+ args.primref_indices[primref_index] = lane_id_data.ref_id;
+ args.primref_indices[primref_index + args.globals->numPrimitives] = qnode - (struct InternalNode*)bvh_mem;
+ }
+
+ bool degenerated = false;
+ uint node_type = is_leaf ? args.leaf_type : args.inner_node_type;
+
+ if(args.leaf_type == NODE_TYPE_INSTANCE)
+ degenerated = subgroup_setInstanceBox_2xSIMD8_in_SIMD16(&lane_box_data.box, lane_num_data.num_children, &mask, SIMD8_lane < lane_num_data.num_prims ? PRIMREF_instanceMask(&lane_box_data.ref) : 0, is_leaf);
+
+ subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, node_type, &lane_box_data.box, lane_num_data.num_children, mask, (struct QBVHNodeN*)(qnode), degenerated, active_lane);
+
+ // return child information
+ if(is_inner)
+ {
+ sg_children_out->x = lane_id_data.sg_bvh2_node;
+ sg_children_out->y = qnode_index + offset + SIMD8_lane;
+ }
+
+ sg_children_out->z = lane_num_data.num_children;
+
+ return is_inner ? lane_num_data.num_children : 0;
+}
+
+void check_primref_integrity( global struct SAHBuildGlobals* globals )
+{
+ global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 );
+ global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out( globals, 0 );
+ dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs( globals );
+ if ( get_local_id( 0 ) == 0 )
+ {
+ for ( uint i = 0; i < num_primrefs; i++ )
+ {
+ primref_out[i] = 0;
+ }
+
+ for ( uint i = 0; i < num_primrefs; i++ )
+ primref_out[primref_in[i]]++;
+
+ for ( uint i = 0; i < num_primrefs; i++ )
+ if ( primref_out[i] != 1 )
+ printf( "Foo: %u %u\n", i, primref_out[i] );
+ }
+}
+
+
+
+
+void check_bvh2(global struct SAHBuildGlobals* globals )
+{
+ global struct BVH2* bvh2 = SAHBuildGlobals_GetBVH2(globals);
+ global uint* primref_in = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0);
+ global uint* primref_out = SAHBuildGlobals_GetPrimrefIndices_Out(globals, 0);
+ dword num_primrefs = SAHBuildGlobals_GetTotalPrimRefs(globals);
+
+ if (get_local_id(0) == 0)
+ {
+ for (uint i = 0; i < num_primrefs; i++)
+ primref_out[i] = 0;
+
+ uint stack[256];
+ uint sp=0;
+ uint r = BVH2_GetRoot(bvh2);
+ stack[sp++] = r;
+ while (sp)
+ {
+ r = stack[--sp];
+ if (BVH2_IsInnerNode(bvh2,r))
+ {
+ uint2 kids = BVH2_GetChildIndices( bvh2, r);
+ if (kids.x >= bvh2->num_nodes || kids.y >= bvh2->num_nodes)
+ {
+ printf("BVH2!! Bad node index found!\n");
+ return;
+ }
+
+ stack[sp++] = kids.x;
+ stack[sp++] = kids.y;
+ }
+ else
+ {
+ uint ref = BVH2_GetLeafPrimStart(bvh2,r);
+ uint count = BVH2_GetLeafPrimCount(bvh2,r);
+ if( count == 0 )
+ {
+ printf("BVH2!! Empty leaf found!\n");
+ return;
+ }
+ for (uint i = 0; i < count; i++)
+ {
+ if (ref + i > num_primrefs)
+ {
+ printf("BVH2!! Bad leaf range!\n");
+ return;
+ }
+ uint c = primref_out[ref+i];
+ if (c != 0)
+ {
+ printf("BVH2!! overlapped prim ranges\n");
+ return;
+ }
+ primref_out[ref+i] = 1;
+ if (primref_in[ref + i] >= num_primrefs)
+ {
+ printf("BAD PRIMREF ID FOUND!\n");
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ printf("bvh2 is ok!\n");
+}
+
+
+#if 0
+// TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size(256,1,1)) )
+__attribute__( (intel_reqd_sub_group_size(8) ) )
+kernel void
+build_qnodes( global struct SAHBuildGlobals* globals, global struct VContextScheduler* scheduler )
+{
+ globals = globals + (scheduler->num_trivial_builds + scheduler->num_single_builds);
+ globals = globals + get_group_id(0);
+
+
+ struct BuildFlatTreeArgs args;
+ args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals );
+ args.leaf_type = SAHBuildGlobals_GetLeafType( globals );
+ args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals );
+ args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals );
+ args.bvh_base = SAHBuildGlobals_GetBVHBase( globals );
+ args.bvh2 = SAHBuildGlobals_GetBVH2( globals );
+ args.globals = (global struct Globals*) globals->p_globals;
+ args.do_mask_processing = SAHBuildGlobals_NeedMasks( globals );
+
+ dword alloc_backpointers = SAHBuildGlobals_NeedBackPointers( globals );
+ global uint2* root_buffer = (global uint2*) globals->p_qnode_root_buffer;
+ global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base );
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base );
+
+ local uint nodes_produced;
+ if ( get_sub_group_id() == 0 )
+ {
+ // allocate first node
+ if (get_sub_group_local_id() == 0)
+ allocate_inner_nodes( args.bvh_base, 1 );
+
+ // first subgroup does first node
+ varying uint3 children_info;
+ uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, BVH2_GetRoot(args.bvh2), qnodes, 0, &children_info );
+
+ if ( get_sub_group_local_id() < num_children )
+ root_buffer[get_sub_group_local_id()] = children_info.xy;
+
+ if ( alloc_backpointers )
+ {
+ // set root's backpointer
+ if( get_sub_group_local_id() == 0 )
+ back_pointers[0] = (0xffffffc0) | (children_info.z << 3);
+
+ // point child backpointers at the parent
+ if( get_sub_group_local_id() < num_children )
+ back_pointers[children_info.y] = 0;
+ }
+
+ if ( get_sub_group_local_id() == 0 )
+ nodes_produced = num_children;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE );
+
+
+ uniform uint buffer_index = get_sub_group_id();
+ uniform bool sg_active = buffer_index < nodes_produced;
+
+ while ( work_group_any( sg_active ) )
+ {
+ if( sg_active )
+ {
+ uniform uint bvh2_node = root_buffer[buffer_index].x;
+ uniform uint qnode_index = root_buffer[buffer_index].y;
+
+ // build a node
+ varying uint3 children_info;
+ uniform ushort num_children = SUBGROUP_BuildFlatTreeNode( args, bvh2_node, qnodes + qnode_index, qnode_index, &children_info );
+
+ // handle backpointers
+ if ( alloc_backpointers )
+ {
+ // update this node's backpointer with child count
+ if ( get_sub_group_local_id() == 0 )
+ back_pointers[qnode_index] |= (children_info.z << 3);
+
+ // point child backpointers at parent
+ if ( get_sub_group_local_id() < num_children )
+ back_pointers[children_info.y] = (qnode_index << 6);
+ }
+
+ if ( num_children )
+ {
+ // allocate space in the child buffer
+ uint root_buffer_position = 0;
+ if ( get_sub_group_local_id() == 0 )
+ root_buffer_position = atomic_add_local( &nodes_produced, num_children );
+ root_buffer_position = sub_group_broadcast( root_buffer_position, 0 );
+
+ // store child indices in root buffer
+ if ( get_sub_group_local_id() < num_children )
+ root_buffer[root_buffer_position + get_sub_group_local_id()] = children_info.xy;
+ }
+ }
+
+ // sync everyone
+ work_group_barrier( CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE,
+ memory_scope_work_group );
+
+
+ if( sg_active )
+ buffer_index += get_num_sub_groups();
+
+ sg_active = (buffer_index < nodes_produced);
+ }
+}
+#endif
+
+
+
+
+
+
+
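+// Conservative ring-buffer check: every node consumed in the next round frees its own slot but can
+// emit up to TREE_ARITY children, so make sure that much space is available before consuming.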
+inline bool buffer_may_overflow( uint capacity, uint current_size, uint elements_processed_per_sub_group )
+{
+ uint num_consumed = min( get_num_sub_groups() * elements_processed_per_sub_group, current_size );
+ uint space_available = (capacity - current_size) + num_consumed;
+ uint space_needed = TREE_ARITY * num_consumed;
+ return space_available < space_needed;
+}
+
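+// Producer/consumer flattening loop: the work-group expands QBVH nodes breadth-first, keeping the
+// frontier of (bvh2 node, qnode index) pairs in an SLM ring buffer.  The loop stops when the frontier
+// is empty or the ring could overflow on the next round, and returns how far the ring was consumed.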
+inline uint build_qnodes_pc(
+ global struct SAHBuildGlobals* globals,
+ bool alloc_backpointers,
+ bool process_masks,
+ uint first_qnode,
+ uint first_bvh2_node,
+
+ local uint2* SLM_local_root_buffer,
+ local uint* SLM_ring_tail,
+ const uint RING_SIZE
+)
+
+{
+ struct BuildFlatTreeArgs args;
+ args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes( globals );
+ args.leaf_type = SAHBuildGlobals_GetLeafType( globals );
+ args.inner_node_type = SAHBuildGlobals_GetInternalNodeType( globals );
+ args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In( globals, 0 );
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs( globals );
+ args.bvh_base = SAHBuildGlobals_GetBVHBase( globals );
+ args.bvh2 = SAHBuildGlobals_GetBVH2( globals );
+ args.globals = (global struct Globals*) globals->p_globals;
+ args.do_mask_processing = process_masks;
+
+ global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes( args.bvh_base );
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base );
+
+ // first subgroup adds first node
+ if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0)
+ {
+ SLM_local_root_buffer[0].x = first_bvh2_node;
+ SLM_local_root_buffer[0].y = first_qnode;
+ *SLM_ring_tail = 1;
+
+ }
+
+ uint ring_head = 0;
+ uint ring_tail = 1;
+ uint ring_size = 1;
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ const uniform uint elements_processed_in_sg = 2;
+
+ while ( ring_size > 0 && !buffer_may_overflow( RING_SIZE, ring_size, elements_processed_in_sg ) )
+ {
+ ushort SIMD16_lane = get_sub_group_local_id();
+
+ // SIMD16 as 2xSIMD8
+ ushort SIMD8_lane = get_sub_group_local_id() % 8;
+ ushort SIMD8_id = get_sub_group_local_id() / 8;
+ bool active_lane;
+
+ uniform uint nodes_consumed = min( get_num_sub_groups() * elements_processed_in_sg, ring_size ); // times two because we process two nodes in subgroup
+ uniform bool sg_active = get_sub_group_id() * elements_processed_in_sg < nodes_consumed;
+ ushort num_children = 0;
+ varying uint3 children_info = 0;
+
+ uint bvh2_node = 0;
+ uint qnode_index = 0;
+
+ if (sg_active)
+ {
+ ushort consumed_pos = get_sub_group_id() * elements_processed_in_sg + SIMD8_id;
+ active_lane = consumed_pos < nodes_consumed ? true : false;
+ consumed_pos = consumed_pos < nodes_consumed ? consumed_pos : consumed_pos-1;
+
+ uint buffer_index = (ring_head + consumed_pos) % RING_SIZE;
+
+ bvh2_node = SLM_local_root_buffer[buffer_index].x;
+ qnode_index = SLM_local_root_buffer[buffer_index].y;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if (sg_active)
+ {
+ // build a node
+ num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, bvh2_node, qnodes, qnode_index, &children_info, active_lane);
+
+ // handle backpointers
+ // TODO_OPT: This should be separate shaders not a runtime branch
+ // doing it this way for now because GRLTLK does not make dynamic shader selection on host very easy.
+ // this needs to change... GRLTLK should
+
+ if (alloc_backpointers && active_lane)
+ {
+ // update this node's backpointer with child count
+ if (SIMD8_lane == 0)
+ back_pointers[qnode_index] |= (children_info.z << 3);
+
+ // point child backpointers at parent
+ if (SIMD8_lane < num_children)
+ back_pointers[children_info.y] = (qnode_index << 6);
+ }
+
+ // save data
+
+ uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0);
+ uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8);
+ uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children;
+
+ uint root_buffer_position = 0;
+
+ // allocate space in the child buffer
+ if (SIMD16_lane == 0 && SIMD16_num_children)
+ root_buffer_position = atomic_add_local(SLM_ring_tail, SIMD16_num_children);
+
+ root_buffer_position = sub_group_broadcast( root_buffer_position, 0 );
+ root_buffer_position += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16
+
+ // store child indices in root buffer
+ if (SIMD8_lane < num_children)
+ {
+ uint store_pos = (root_buffer_position + SIMD8_lane) % RING_SIZE;
+ SLM_local_root_buffer[store_pos] = children_info.xy;
+ }
+ }
+
+ // sync everyone
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ ring_head += nodes_consumed;
+ ring_tail = *SLM_ring_tail;
+ ring_size = ring_tail - ring_head;
+ }
+
+ return ring_head;
+}
+
+
+
+
+inline void amplify_and_spill(
+ global struct SAHBuildGlobals* globals,
+ dword alloc_backpointers,
+ uint first_qnode,
+ uint first_bvh2_node,
+ global uint2* global_root_buffer,
+ local uint* root_buffer_counter,
+ const uint RING_SIZE
+)
+
+{
+ struct BuildFlatTreeArgs args;
+ args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals);
+ args.leaf_type = SAHBuildGlobals_GetLeafType(globals);
+ args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals);
+ args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0);
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals);
+ args.bvh_base = SAHBuildGlobals_GetBVHBase(globals);
+ args.bvh2 = SAHBuildGlobals_GetBVH2(globals);
+ args.globals = (global struct Globals*) globals->p_globals;
+
+ global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base);
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base);
+
+
+ varying uint3 children_info;
+ uniform ushort num_children = SUBGROUP_BuildFlatTreeNode(args, first_bvh2_node, qnodes + first_qnode, first_qnode, &children_info);
+
+ if (alloc_backpointers)
+ {
+ // set first node's backpointer
+ if (get_sub_group_local_id() == 0)
+ {
+ // if first node is root, use root sentinel in backpointer
+ // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread)
+ uint bp = 0xffffffc0;
+ if (first_qnode != 0)
+ bp = back_pointers[first_qnode];
+ bp |= (children_info.z << 3);
+
+ back_pointers[first_qnode] = bp;
+ }
+
+ // point child backpointers at the parent
+ if (get_sub_group_local_id() < num_children)
+ back_pointers[children_info.y] = (first_qnode << 6);
+ }
+
+ if (num_children)
+ {
+ uint spill_pos = 0;
+ if (get_sub_group_local_id() == 0)
+ spill_pos = atomic_add_local(root_buffer_counter,num_children);
+
+ spill_pos = sub_group_broadcast(spill_pos, 0);
+
+ if (get_sub_group_local_id() < num_children)
+ global_root_buffer[spill_pos+get_sub_group_local_id()] = children_info.xy;
+ }
+
+}
+
+
+
+
+inline void build_qnodes_pc_kickoff_func(
+ global struct SAHBuildGlobals* globals,
+ global uint2* root_buffer,
+ bool alloc_backpointers,
+ bool process_masks,
+
+ local uint2* SLM_local_root_buffer,
+ local uint* SLM_spill_pos,
+ local uint* SLM_ring_tail,
+ int RING_SIZE
+)
+{
+ // allocate first node
+ if ( get_sub_group_id() == 0 && get_sub_group_local_id() == 0 )
+ allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(globals), 1 );
+
+ *SLM_spill_pos=0;
+
+ uint ring_head = build_qnodes_pc( globals, alloc_backpointers, process_masks,
+ 0, BVH2_GetRoot(SAHBuildGlobals_GetBVH2(globals)), SLM_local_root_buffer, SLM_ring_tail, RING_SIZE );
+
+
+ uint n = *SLM_ring_tail - ring_head;
+ if (n > 0)
+ {
+#if 0
+ // do an additional round of amplification so we can get more nodes into the root buffer and go wider in the next phase
+ /// JDB TODO: this is causing hangs on DG2 for metro, so disabling for now...
+ for (uint i = get_sub_group_id(); i < n; i+= get_num_sub_groups() )
+ {
+ uint consume_pos = (ring_head + i) % RING_SIZE;
+ uniform uint bvh2_root = SLM_local_root_buffer[consume_pos].x;
+ uniform uint qnode_root = SLM_local_root_buffer[consume_pos].y;
+
+ amplify_and_spill( globals, alloc_backpointers, qnode_root, bvh2_root, root_buffer, SLM_spill_pos, RING_SIZE );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+#else
+ for (uint i = get_local_id(0); i < n; i += get_local_size(0))
+ root_buffer[i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE];
+#endif
+
+ if (get_local_id(0) == 0)
+ {
+ globals->root_buffer_num_produced = n;
+ globals->root_buffer_num_produced_hi = 0;
+ globals->root_buffer_num_consumed = 0;
+ globals->root_buffer_num_consumed_hi = 0;
+ }
+ }
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 256, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+build_qnodes_pc_kickoff(
+ global struct SAHBuildGlobals* globals,
+ global uint2* root_buffer,
+ dword sah_flags
+)
+{
+ bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS;
+ bool process_masks = sah_flags & SAH_FLAG_NEED_MASKS;
+
+
+ const int RING_SIZE = 64;
+
+ local uint2 SLM_local_root_buffer[RING_SIZE];
+ local uint SLM_spill_pos;
+ local uint SLM_ring_tail;
+
+ build_qnodes_pc_kickoff_func(globals,
+ root_buffer,
+ alloc_backpointers,
+ process_masks,
+ SLM_local_root_buffer,
+ &SLM_spill_pos,
+ &SLM_ring_tail,
+ RING_SIZE
+ );
+}
+
+
+
+
+inline void build_qnodes_pc_amplify_func(
+ global struct SAHBuildGlobals* globals,
+ global uint2* root_buffer,
+ bool alloc_backpointers,
+ bool process_masks,
+
+ local uint2* SLM_local_root_buffer,
+ local uint* SLM_broadcast,
+ local uint* SLM_ring_tail,
+ int RING_SIZE
+ )
+{
+    // TODO_OPT: Probably don't need this atomic; we could clear 'num_consumed' every time
+    //           and just use get_group_id()
+ //
+
+ if (get_local_id(0) == 0)
+ *SLM_broadcast = atomic_inc_global(&globals->root_buffer_num_consumed);
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ uniform uint consume_pos = *SLM_broadcast;
+ uniform uint bvh2_root = root_buffer[consume_pos].x;
+ uniform uint qnode_root = root_buffer[consume_pos].y;
+
+ uint ring_head = build_qnodes_pc(globals, alloc_backpointers,process_masks,
+ qnode_root, bvh2_root, SLM_local_root_buffer, SLM_ring_tail, RING_SIZE);
+
+ // TODO_OPT: Instead of spilling the nodes, do one more round of amplification and write
+ // generated children directly into the root buffer. This should allow faster amplification
+
+ // spill root buffer contents
+ uint n = *SLM_ring_tail - ring_head;
+ if (n > 0)
+ {
+
+ if (get_local_id(0) == 0)
+ *SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n);
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ uint produce_pos = *SLM_broadcast;
+
+ for (uint i = get_local_id(0); i < n; i += get_local_size(0))
+ root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i) % RING_SIZE];
+ }
+}
+
+
+
+
+
+// Process two nodes per workgroup during the amplification phase.
+// Doing it this way ensures maximum parallelism.
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void
+build_qnodes_pc_amplify(
+ global struct SAHBuildGlobals* globals,
+ global uint2* root_buffer,
+ dword sah_flags )
+{
+ bool alloc_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS;
+
+ struct BuildFlatTreeArgs args;
+ args.leaf_size_in_bytes = SAHBuildGlobals_GetLeafSizeInBytes(globals);
+ args.leaf_type = SAHBuildGlobals_GetLeafType(globals);
+ args.inner_node_type = SAHBuildGlobals_GetInternalNodeType(globals);
+ args.primref_indices = SAHBuildGlobals_GetPrimrefIndices_In(globals, 0);
+ args.primref_buffer = SAHBuildGlobals_GetPrimrefs(globals);
+ args.bvh_base = SAHBuildGlobals_GetBVHBase(globals);
+ args.bvh2 = SAHBuildGlobals_GetBVH2(globals);
+ args.globals = (global struct Globals*) globals->p_globals;
+ args.do_mask_processing = sah_flags & SAH_FLAG_NEED_MASKS;
+
+ global struct InternalNode* qnodes = (global struct InternalNode*) BVHBase_GetInternalNodes(args.bvh_base);
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers(args.bvh_base);
+
+ ushort SIMD16_lane = get_sub_group_local_id();
+
+ // SIMD16 as 2xSIMD8
+ ushort SIMD8_lane = get_sub_group_local_id() % 8;
+ ushort SIMD8_id = get_sub_group_local_id() / 8;
+ bool active_lane = false;
+
+ uint consume_pos;
+    consume_pos = globals->root_buffer_num_consumed + get_group_id(0) * 2; // times 2 because we process two nodes per workgroup
+ consume_pos += SIMD8_id;
+
+ active_lane = consume_pos < globals->root_buffer_num_to_consume ? true : false;
+ consume_pos = consume_pos < globals->root_buffer_num_to_consume ? consume_pos : consume_pos-1;
+
+ uint first_bvh2_node = root_buffer[consume_pos].x;
+ uint first_qnode = root_buffer[consume_pos].y;
+
+ varying uint3 children_info;
+ ushort num_children = SUBGROUP_BuildFlatTreeNode_2xSIMD8_in_SIMD16(args, first_bvh2_node, qnodes, first_qnode, &children_info, active_lane);
+
+ if (alloc_backpointers && active_lane)
+ {
+ // set first node's backpointer
+ if (SIMD8_lane == 0)
+ {
+ // if first node is root, use root sentinel in backpointer
+ // otherwise, need to merge the child count in with the parent offset (which was already put there by the parent's thread)
+ uint bp = 0xffffffc0;
+ if (first_qnode != 0)
+ bp = back_pointers[first_qnode];
+ bp |= (children_info.z << 3);
+
+ back_pointers[first_qnode] = bp;
+ }
+
+ // point child backpointers at the parent
+ if (SIMD8_lane < num_children)
+ back_pointers[children_info.y] = (first_qnode << 6);
+ }
+
+ // save data
+ {
+        // sum children from both halves of the SIMD16 to do only one atomic per subgroup
+ uint produce_pos;
+ uniform ushort first_SIMD8_num_children = sub_group_broadcast(num_children, 0);
+ uniform ushort second_SIMD8_num_children = sub_group_broadcast(num_children, 8);
+ uniform ushort SIMD16_num_children = first_SIMD8_num_children + second_SIMD8_num_children;
+
+ if (SIMD16_lane == 0 && SIMD16_num_children)
+ produce_pos = atomic_add_global(&globals->root_buffer_num_produced, SIMD16_num_children);
+
+ produce_pos = sub_group_broadcast(produce_pos, 0);
+ produce_pos += SIMD8_id * first_SIMD8_num_children; // update offset for second half of SIMD16
+
+ if (SIMD8_lane < num_children)
+ {
+ root_buffer[produce_pos + SIMD8_lane] = children_info.xy;
+ }
+ }
+}
+
+
+//////////
+//
+// Batched version of qnode creation
+//
+//////////
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+kernel void
+build_qnodes_init_scheduler_batched(global struct QnodeScheduler* scheduler, dword num_builds, dword num_max_qnode_global_root_buffer_entries)
+{
+
+ scheduler->batched_build_offset = scheduler->num_trivial_builds + scheduler->num_single_builds;
+ scheduler->batched_build_count = num_builds - scheduler->batched_build_offset;
+ scheduler->num_max_qnode_global_root_buffer_entries = num_max_qnode_global_root_buffer_entries;
+
+ const uint num_builds_to_process = scheduler->batched_build_count;
+ const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries;
+
+ scheduler->batched_builds_to_process = num_builds_to_process;
+    scheduler->num_qnode_grb_curr_entries = (num_builds_to_process + 15) / 16; // here we store the number of workgroups for the "build_qnodes_begin_batchable" kernel
+ scheduler->num_qnode_grb_new_entries = num_builds_to_process;
+ scheduler->qnode_global_root_buffer.curr_entries_offset = max_qnode_grb_entries;
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+build_qnodes_begin_batchable(global struct QnodeScheduler* scheduler,
+ global struct SAHBuildGlobals* builds_globals)
+{
+ const uint tid = get_group_id(0) * get_local_size(0) + get_local_id(0);
+
+ const uint num_builds_to_process = scheduler->batched_builds_to_process;
+
+ if(tid < num_builds_to_process)
+ {
+ const uint build_idx = scheduler->batched_build_offset + tid;
+
+ uint bvh2_node = BVH2_GetRoot(SAHBuildGlobals_GetBVH2(&builds_globals[build_idx]));
+ uint qnode = 0;
+ struct QNodeGlobalRootBufferEntry entry = { bvh2_node, qnode, build_idx, 1};
+ scheduler->qnode_global_root_buffer.entries[tid] = entry;
+
+ builds_globals[build_idx].root_buffer_num_produced = 0;
+ builds_globals[build_idx].root_buffer_num_produced_hi = 0;
+ builds_globals[build_idx].root_buffer_num_consumed = 0;
+ builds_globals[build_idx].root_buffer_num_consumed_hi = 0;
+
+ // allocate first node for this build
+ //allocate_inner_nodes( SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx]), 1 );
+ SAHBuildGlobals_GetBVHBase(&builds_globals[build_idx])->nodeDataCur++;
+ }
+}
+
+
+
+
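+// Note (informal): the qnode_global_root_buffer is used double-buffered. Entries for the
+// current round start at curr_entries_offset (either 0 or num_max_qnode_global_root_buffer_entries),
+// newly produced entries are written into the other half, and the kernel below swaps the
+// halves and publishes the entry count for the next round.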
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 1, 1, 1 )) )
+kernel void
+build_qnodes_scheduler(global struct QnodeScheduler* scheduler)
+{
+ const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries;
+
+ uint new_entries = min(scheduler->num_qnode_grb_new_entries, max_qnode_grb_entries);
+
+ scheduler->num_qnode_grb_curr_entries = new_entries;
+ scheduler->num_qnode_grb_new_entries = 0;
+ scheduler->qnode_global_root_buffer.curr_entries_offset = scheduler->qnode_global_root_buffer.curr_entries_offset ? 0 : max_qnode_grb_entries;
+}
+
+
+
+
+// TODO_OPT: Enable larger WGs. WGSize 512 at SIMD8 hangs on Gen9, but Gen12 can go bigger
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 32, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+build_qnodes_pc_amplify_batched(
+ global struct SAHBuildGlobals* builds_globals,
+ global struct QnodeScheduler* scheduler
+ )
+{
+ const uint group_id = get_group_id(0);
+
+ global struct QNodeGlobalRootBuffer* global_root_buffer = &scheduler->qnode_global_root_buffer;
+ const uint curr_entries_offset = global_root_buffer->curr_entries_offset;
+ struct QNodeGlobalRootBufferEntry entry = global_root_buffer->entries[curr_entries_offset + group_id];
+
+ const uint build_id = entry.build_idx;
+
+ global struct SAHBuildGlobals* globals = &builds_globals[build_id];
+ global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer;
+ bool alloc_backpointers = SAHBuildGlobals_NeedBackPointers(globals);
+ bool process_masks = SAHBuildGlobals_NeedMasks(globals);
+
+ const int RING_SIZE = 32; // for 2 SGs, 16 should result in 2 rounds: one SG produces 6, then 2 SGs consume 2 and produce 12
+ // for 4 SGs, 32 results in 2 rounds: one SG produces 6, 4 SGs consume 4 and produce 24, resulting in 26
+
+ local uint2 SLM_local_root_buffer[RING_SIZE];
+ local uint SLM_broadcast;
+ local uint SLM_ring_tail;
+ local uint SLM_grb_broadcast;
+
+
+ //// This below can be moved to separate function if needed for TLAS ////
+
+ uniform uint bvh2_root = entry.bvh2_node;
+ uniform uint qnode_root = entry.qnode;
+
+ uint ring_head = build_qnodes_pc(globals, alloc_backpointers, process_masks,
+ qnode_root, bvh2_root, SLM_local_root_buffer, &SLM_ring_tail, RING_SIZE);
+
+ // spill root buffer contents
+ uint n = SLM_ring_tail - ring_head;
+ if (n > 0)
+ {
+ const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries;
+
+ if (get_local_id(0) == 0)
+ {
+ SLM_grb_broadcast = atomic_add_global(&scheduler->num_qnode_grb_new_entries, n);
+
+ if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, then make space in build's root_buffer
+ SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n);
+ else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then make space in build's root_buffer
+ SLM_broadcast = atomic_add_global(&globals->root_buffer_num_produced, n - (max_qnode_grb_entries - SLM_grb_broadcast));
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ uint produce_pos = SLM_broadcast;
+
+ uint grb_produce_num = n; // grb stands for global_root_buffer
+ uint lrb_produce_num = 0; // lrb stands for local root buffer, meaning this build's root_buffer
+
+ if(SLM_grb_broadcast >= max_qnode_grb_entries) // if global_root_buffer is full, don't write to it
+ {
+ grb_produce_num = 0;
+ lrb_produce_num = n;
+ }
+ else if( (SLM_grb_broadcast + n) >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then decrease amount of entries and store rest in build's root buffer
+ {
+ grb_produce_num = max_qnode_grb_entries - SLM_grb_broadcast;
+ lrb_produce_num = n - grb_produce_num;
+ }
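+        // Illustrative numbers (not taken from the source): with max_qnode_grb_entries == 1024,
+        // SLM_grb_broadcast == 1020 and n == 16, the split is grb_produce_num = 4 and
+        // lrb_produce_num = 12; the first 4 entries go to the global root buffer and the
+        // remaining 12 spill into this build's own root buffer below.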
+
+ // save data to global_root_buffer
+ for(uint i = get_local_id(0); i < grb_produce_num; i += get_local_size(0))
+ {
+ const uint2 slm_record = SLM_local_root_buffer[(ring_head + i) % RING_SIZE];
+
+ struct QNodeGlobalRootBufferEntry new_entry;
+ new_entry.bvh2_node = slm_record.x;
+ new_entry.qnode = slm_record.y;
+ new_entry.build_idx = entry.build_idx;
+
+ const uint new_entries_offset = curr_entries_offset ? 0 : max_qnode_grb_entries;
+ global_root_buffer->entries[new_entries_offset + SLM_grb_broadcast + i] = new_entry;
+ }
+
+ // if anything left, write to build's root buffer
+ for (uint i = get_local_id(0); i < lrb_produce_num; i += get_local_size(0))
+ root_buffer[produce_pos + i] = SLM_local_root_buffer[(ring_head + i + grb_produce_num) % RING_SIZE];
+ }
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void
+build_qnodes_try_to_fill_grb_batched(
+ global struct SAHBuildGlobals* builds_globals,
+ global struct QnodeScheduler* scheduler
+ )
+{
+ const uint build_id = scheduler->batched_build_offset + get_group_id(0);
+ global struct SAHBuildGlobals* globals = &builds_globals[build_id];
+ global uint2* root_buffer = (global uint2*)globals->p_qnode_root_buffer;
+
+ global struct QNodeGlobalRootBuffer* qnode_root_buffer = (global struct QNodeGlobalRootBuffer*)&scheduler->qnode_global_root_buffer;
+
+ const uint num_produced = globals->root_buffer_num_produced;
+ const uint num_consumed = globals->root_buffer_num_consumed;
+ const uint entries = num_produced - num_consumed; // entries to build's root buffer
+
+ if(!entries)
+ return;
+
+ uint global_root_buffer_offset;
+ if(get_local_id(0) == 0)
+ global_root_buffer_offset = atomic_add_global(&scheduler->num_qnode_grb_new_entries, entries);
+
+ global_root_buffer_offset = sub_group_broadcast(global_root_buffer_offset, 0);
+
+ const uint max_qnode_grb_entries = scheduler->num_max_qnode_global_root_buffer_entries;
+
+ if(global_root_buffer_offset >= max_qnode_grb_entries) // if global_root_buffer is full, then return
+ return;
+
+ uint global_root_buffer_produce_num = entries;
+ if(global_root_buffer_offset + entries >= max_qnode_grb_entries) // if we exceed global_root_buffer with our entries, then reduce number of entries to push
+ global_root_buffer_produce_num = max_qnode_grb_entries - global_root_buffer_offset;
+
+ for(uint i = get_local_id(0); i < global_root_buffer_produce_num; i += get_local_size(0))
+ {
+ const uint2 entry = root_buffer[num_consumed + i];
+
+ struct QNodeGlobalRootBufferEntry new_entry;
+ new_entry.bvh2_node = entry.x;
+ new_entry.qnode = entry.y;
+ new_entry.build_idx = build_id;
+
+ const uint new_entries_offset = qnode_root_buffer->curr_entries_offset ? 0 : max_qnode_grb_entries;
+ qnode_root_buffer->entries[new_entries_offset + global_root_buffer_offset + i] = new_entry;
+ }
+
+ if(get_local_id(0) == 0)
+ globals->root_buffer_num_consumed += global_root_buffer_produce_num;
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl
new file mode 100644
index 00000000000..1f64ef3fbe2
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_DFS.cl
@@ -0,0 +1,2025 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "intrinsics.h"
+#include "AABB3f.h"
+#include "AABB.h"
+#include "GRLGen12.h"
+#include "quad.h"
+#include "common.h"
+#include "instance.h"
+
+#include "api_interface.h"
+
+#include "binned_sah_shared.h"
+
+
+#if 0
+#define LOOP_TRIPWIRE_INIT uint _loop_trip=0;
+
+#define LOOP_TRIPWIRE_INCREMENT(max_iterations) \
+ _loop_trip++;\
+ if ( _loop_trip > max_iterations )\
+ {\
+ if( get_local_id(0) == 0 )\
+ printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!! group=%u\n", get_group_id(0) );\
+ break;\
+ }
+#else
+
+#define LOOP_TRIPWIRE_INIT
+#define LOOP_TRIPWIRE_INCREMENT(max_iterations)
+
+#endif
+
+
+// =========================================================
+// DFS
+// =========================================================
+
+// there are 128 threads x SIMD16 == 2048 lanes in a DSS
+// There is 128KB of SLM. Upper limit of 64KB per WG, so target is 2 groups of 1024 lanes @ 64K each
+// --> Full occupancy requires using less than 64B per lane
+//
+// Groups of 256 lanes gives us 16KB per group
+//
+
+// We use subgroups very heavily here in order to avoid
+// use of per-thread scratch space for intermediate values
+
+#define DFS_WG_SIZE 256
+#define DFS_NUM_SUBGROUPS 16
+#define DFS_BVH2_NODE_COUNT (2*(DFS_WG_SIZE)-1)
+#define TREE_ARITY 6
+
+// FlatTree node limits:
+// these are the derivations if we always collapse to one primitive and pack nodes as tightly as possible
+// If BVH2 construction is allowed to terminate early and place multiple prims in a leaf, these numbers will be too low
+#if 0
+
+// maximum flattree size is the number of inner nodes in a full M-ary tree with one leaf per primitive
+// This is given by I = (L-1)/(M-1)
+// For a 256 thread workgroup, L=256, M=6, this gives: 51
+#define DFS_MAX_FLATTREE_NODES 51
+
+
+// A flattree leaf is a node which contains only primitives.
+//
+// The maximum number of leaves is related to the number of nodes as:
+// L(N) = ((M-1)*N + 1) / M
+//
+#define DFS_MAX_FLATTREE_LEAFS 43 // = 43 for 256 thread WG (L=256, M=6)
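+// Worked check of the two formulas above for a 256-thread WG (M = 6):
+//   inner nodes: (256-1)/(6-1) = 51
+//   leaf nodes:  ((6-1)*51 + 1)/6 = 256/6 ~= 42.7, rounded up to 43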
+
+#else
+
+// This is the result of estimate_qbvh6_nodes(256)
+
+#define DFS_MAX_FLATTREE_LEAFS 256
+#define DFS_MAX_FLATTREE_NODES 307   // 256 fat-leaves + 51 inner nodes.  51 = (256-1)/(6-1)
+#define DFS_MAX_FLATTREE_DEPTH 52 // number of inner nodes in the worst-case tree
+
+#endif
+
+#define uniform
+#define varying
+
+
+struct DFSArgs
+{
+ global struct BVHBase* bvh_base;
+ global PrimRef* primref_buffer;
+ ushort leaf_node_type;
+ ushort inner_node_type;
+ ushort leaf_size_in_bytes;
+ bool need_backpointers;
+ bool need_masks;
+ ushort num_primrefs;
+ global uint* primref_index_buffer;
+};
+
+
+struct DFSPrimRefAABB
+{
+ half lower[3];
+ half upper[3];
+};
+
+GRL_INLINE void DFSPrimRefAABB_init( struct DFSPrimRefAABB* bb )
+{
+ bb->lower[0] = 1;
+ bb->lower[1] = 1;
+ bb->lower[2] = 1;
+ bb->upper[0] = 0;
+ bb->upper[1] = 0;
+ bb->upper[2] = 0;
+}
+
+GRL_INLINE void DFSPrimRefAABB_extend( struct DFSPrimRefAABB* aabb, struct DFSPrimRefAABB* v )
+{
+ aabb->lower[0] = min( aabb->lower[0], v->lower[0] );
+ aabb->lower[1] = min( aabb->lower[1], v->lower[1] );
+ aabb->lower[2] = min( aabb->lower[2], v->lower[2] );
+ aabb->upper[0] = max( aabb->upper[0], v->upper[0] );
+ aabb->upper[1] = max( aabb->upper[1], v->upper[1] );
+ aabb->upper[2] = max( aabb->upper[2], v->upper[2] );
+}
+
+GRL_INLINE float DFSPrimRefAABB_halfArea( struct DFSPrimRefAABB* aabb )
+{
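+    // halfArea(d) = d.x*d.y + d.x*d.z + d.y*d.z (half the surface area of a box with extents d),
+    // computed with a single fma below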
+ const half3 d = (half3)(aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2]);
+ return fma( d.x, (d.y + d.z), d.y * d.z );
+}
+
+GRL_INLINE struct DFSPrimRefAABB DFSPrimRefAABB_sub_group_reduce( struct DFSPrimRefAABB* aabb )
+{
+ struct DFSPrimRefAABB bounds;
+ bounds.lower[0] = sub_group_reduce_min( aabb->lower[0] );
+ bounds.lower[1] = sub_group_reduce_min( aabb->lower[1] );
+ bounds.lower[2] = sub_group_reduce_min( aabb->lower[2] );
+ bounds.upper[0] = sub_group_reduce_max( aabb->upper[0] );
+ bounds.upper[1] = sub_group_reduce_max( aabb->upper[1] );
+ bounds.upper[2] = sub_group_reduce_max( aabb->upper[2] );
+ return bounds;
+}
+
+struct DFSPrimRef
+{
+ struct DFSPrimRefAABB aabb;
+ uint2 meta;
+};
+
+struct PrimRefMeta
+{
+ uchar2 meta;
+};
+
+GRL_INLINE uint PrimRefMeta_GetInputIndex( struct PrimRefMeta* it )
+{
+ return it->meta.x;
+}
+GRL_INLINE uint PrimRefMeta_GetInstanceMask( struct PrimRefMeta* it )
+{
+ return it->meta.y;
+}
+
+
+struct PrimRefSet
+{
+ struct AABB3f root_aabb;
+ struct DFSPrimRefAABB AABB[DFS_WG_SIZE];
+ uint2 meta[DFS_WG_SIZE];
+
+};
+
+GRL_INLINE local struct DFSPrimRefAABB* PrimRefSet_GetAABBPointer( local struct PrimRefSet* refs, ushort id )
+{
+ return &refs->AABB[id];
+}
+
+GRL_INLINE float PrimRefSet_GetMaxAABBArea( local struct PrimRefSet* refs )
+{
+ float3 root_l = AABB3f_load_lower( &refs->root_aabb );
+ float3 root_u = AABB3f_load_upper( &refs->root_aabb );
+ float3 d = root_u - root_l;
+ float scale = 1.0f / max( d.x, max( d.y, d.z ) );
+
+ half3 dh = convert_half3_rtp( d * scale );
+ return fma( dh.x, (dh.y + dh.z), dh.y * dh.z );
+}
+
+GRL_INLINE float3 ulp3( float3 v ) {
+
+ return fabs(v) * FLT_EPSILON;
+}
+
+GRL_INLINE struct AABB PrimRefSet_ConvertAABB( local struct PrimRefSet* refs, struct DFSPrimRefAABB* box )
+{
+ float3 root_l = AABB3f_load_lower( &refs->root_aabb );
+ float3 root_u = AABB3f_load_upper( &refs->root_aabb );
+ float3 d = root_u - root_l;
+ float scale = max( d.x, max( d.y, d.z ) );
+
+ float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) );
+ float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) );
+ l = l * scale + root_l ;
+ u = u * scale + root_l ;
+
+    // clamping is necessary in case a vertex lies exactly on the upper AABB plane.
+ // If we use unclamped values, roundoff error in the scale factor calculation can cause us
+ // to snap to a flattened AABB that lies outside of the original one, resulting in missed geometry.
+ u = min( u, root_u );
+ l = min( l, root_u );
+
+ struct AABB r;
+ r.lower.xyz = l.xyz;
+ r.upper.xyz = u.xyz;
+ return r;
+}
+
+GRL_INLINE PrimRef PrimRefSet_GetFullPrecisionAABB( local struct PrimRefSet* refs, ushort id )
+{
+ struct AABB r;
+ r = PrimRefSet_ConvertAABB( refs, &refs->AABB[id] );
+ r.lower.w = 0;
+ r.upper.w = 0;
+ return r;
+}
+
+GRL_INLINE uint PrimRefSet_GetInputIndex( local struct PrimRefSet* refs, ushort id )
+{
+ return refs->meta[id].x;
+}
+
+GRL_INLINE uint PrimRefSet_GetInstanceMask( local struct PrimRefSet* refs, ushort id )
+{
+ return refs->meta[id].y;
+}
+GRL_INLINE struct PrimRefMeta PrimRefSet_GetMeta( local struct PrimRefSet* refs, ushort id )
+{
+ struct PrimRefMeta meta;
+ meta.meta.x = refs->meta[id].x;
+ meta.meta.y = refs->meta[id].y;
+ return meta;
+}
+
+
+GRL_INLINE struct DFSPrimRef PrimRefSet_GetPrimRef( local struct PrimRefSet* refs, ushort id )
+{
+ struct DFSPrimRef r;
+ r.aabb = refs->AABB[id];
+ r.meta = refs->meta[id];
+ return r;
+}
+
+
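+// Primrefs are stored in SLM as half-precision boxes normalized to the root AABB
+// (offset by its lower corner, scaled by its largest extent). Lowers round toward zero
+// and uppers round toward +inf, so the quantized box conservatively contains the original.
+// Illustrative numbers (not from the source): with a root spanning [0,100] in x
+// (scale = 1/100), a primref with x in [33.33, 35.0] stores a half lower <= 0.3333 and a
+// half upper >= 0.35, so converting back never shrinks the box.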
+GRL_INLINE void PrimRefSet_SetPrimRef_FullPrecision( local struct PrimRefSet* refs, PrimRef ref, ushort id )
+{
+
+ float3 root_l = AABB3f_load_lower( &refs->root_aabb );
+ float3 root_u = AABB3f_load_upper( &refs->root_aabb );
+ float3 d = root_u - root_l;
+ float scale = 1.0f / max(d.x, max(d.y,d.z));
+
+ float3 l = ref.lower.xyz;
+ float3 u = ref.upper.xyz;
+ half3 lh = convert_half3_rtz( (l - root_l) * scale );
+ half3 uh = convert_half3_rtp( (u - root_l) * scale );
+
+ refs->AABB[id].lower[0] = lh.x;
+ refs->AABB[id].lower[1] = lh.y;
+ refs->AABB[id].lower[2] = lh.z;
+ refs->AABB[id].upper[0] = uh.x;
+ refs->AABB[id].upper[1] = uh.y;
+ refs->AABB[id].upper[2] = uh.z;
+ refs->meta[id].x = id;
+ refs->meta[id].y = PRIMREF_instanceMask(&ref);
+
+
+}
+
+GRL_INLINE void PrimRefSet_SetPrimRef( local struct PrimRefSet* refs, struct DFSPrimRef ref, ushort id )
+{
+ refs->AABB[id] = ref.aabb;
+ refs->meta[id] = ref.meta;
+}
+
+GRL_INLINE struct AABB3f PrimRefSet_GetRootAABB( local struct PrimRefSet* refs )
+{
+ return refs->root_aabb;
+}
+
+GRL_INLINE void SUBGROUP_PrimRefSet_Initialize( local struct PrimRefSet* refs )
+{
+ if ( get_sub_group_local_id() == 0 )
+ AABB3f_init( &refs->root_aabb ); // TODO_OPT: subgroup-vectorized version of AABB3f_init
+}
+
+
+GRL_INLINE void PrimRefSet_Printf( local struct PrimRefSet* refs, ushort num_prims )
+{
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "Scene AABB:\n" );
+ struct AABB3f rootBox = PrimRefSet_GetRootAABB( refs );
+ AABB3f_print( &rootBox );
+
+ float ma = PrimRefSet_GetMaxAABBArea( refs );
+
+ for ( uint i = 0; i < num_prims; i++ )
+ {
+ printf( "Ref: %u\n", i );
+ struct AABB r = PrimRefSet_GetFullPrecisionAABB( refs, i );
+ AABB_print( &r );
+
+ float a = DFSPrimRefAABB_halfArea( PrimRefSet_GetAABBPointer( refs, i ) );
+ printf( "Scaled Area: %f / %f = %f \n", a, ma, a / ma );
+
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+
+GRL_INLINE void PrimRefSet_CheckBounds( local struct PrimRefSet* refs, ushort num_prims, PrimRef* primref_buffer )
+{
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ if ( get_local_id( 0 ) == 0 )
+ {
+
+ for ( uint i = 0; i < num_prims; i++ )
+ {
+ PrimRef ref = primref_buffer[i];
+ struct AABB r2 = PrimRefSet_GetFullPrecisionAABB( refs, i );
+
+ struct DFSPrimRefAABB* box = &refs->AABB[i];
+ float3 l = convert_float3_rtz( (half3)(box->lower[0], box->lower[1], box->lower[2]) );
+ float3 u = convert_float3_rtp( (half3)(box->upper[0], box->upper[1], box->upper[2]) );
+
+ printf( " halfs:{%x,%x,%x}{%x,%x,%x}\n", as_uint(l.x), as_uint(l.y), as_uint(l.z), as_uint(u.x), as_uint(u.y), as_uint(u.z) );
+
+ printf( " {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%f,%f,%f} {%u,%u,%u,%u,%u,%u}\n",
+ ref.lower.x, ref.lower.y, ref.lower.z, r2.lower.x, r2.lower.y, r2.lower.z,
+ ref.upper.x, ref.upper.y, ref.upper.z, r2.upper.x, r2.upper.y, r2.upper.z,
+ r2.lower.x <= ref.lower.x,
+ r2.lower.y <= ref.lower.y,
+ r2.lower.z <= ref.lower.z,
+
+ r2.upper.x >= ref.upper.x,
+ r2.upper.y >= ref.upper.y,
+ r2.upper.z >= ref.upper.z );
+
+ }
+
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+
+struct LocalBVH2
+{
+ uint num_nodes;
+ uint nodes[DFS_BVH2_NODE_COUNT];
+
+ // nodes are a bitfield:
+ // bits 8:0 (9b) ==> number of primrefs in this subtree
+ //
+ // bits 17:9 (9b) ==> for an inner node: contains offset to a pair of children
+ // ==> for a leaf node: contains index of the first primref in this leaf
+ //
+ // bits 30:18 (13b) ==> quantized AABB area (relative to root box)
+ // bit 31 (1b) ==> is_inner flag
+ //
+    // NOTE: The left child offset of any node is always odd.  Therefore, it is possible to recover a bit if we need it
+ // by storing only the 8 MSBs
+};
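+
+// Illustrative decode of a node word (value made up): 0xFFFC0300 unpacks to
+//   subtree prim count =  0xFFFC0300        & 0x1ff  = 256
+//   child pair / start = (0xFFFC0300 >> 9)  & 0x1ff  = 1
+//   quantized area     = (0xFFFC0300 >> 18) & 0x1fff = 8191
+//   is_inner           =  0xFFFC0300 >> 31           = 1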
+
+#define DFS_BVH2_AREA_QUANT 8191.0f
+
+
+
+GRL_INLINE void SUBGROUP_LocalBVH2_Initialize( local struct LocalBVH2* tree, ushort num_prims )
+{
+ tree->num_nodes = 1; // include the root node
+ tree->nodes[0] = num_prims; // initialize root node as a leaf containing the full subtree
+
+}
+
+GRL_INLINE void LocalBVH2_CreateInnerNode( local struct LocalBVH2* tree, ushort node_index,
+ ushort start_left, ushort start_right,
+ ushort quantized_left_area, ushort quantized_right_area )
+{
+ uint child_pos = atomic_add_local( &tree->num_nodes, 2 );
+
+ // set the inner node flag and child position in the parent
+ // leave the other bits intact
+ uint parent_node = tree->nodes[node_index];
+ parent_node |= 0x80000000;
+ parent_node = (parent_node & ~(0x1ff<<9)) | (child_pos << 9);
+ tree->nodes[node_index] = parent_node;
+
+ // setup children as leaf nodes with prim-count zero
+ uint left_child = (convert_uint(start_left) << 9) | (convert_uint( quantized_left_area ) << 18);
+ uint right_child = (convert_uint(start_right) << 9) | (convert_uint( quantized_right_area ) << 18);
+ tree->nodes[child_pos] = left_child;
+ tree->nodes[child_pos + 1] = right_child;
+
+}
+
+GRL_INLINE ushort LocalBVH2_IncrementPrimCount( local struct LocalBVH2* tree, ushort node_index )
+{
+ // increment only the lower bits. Given correct tree construction algorithm this will not overflow into MSBs
+ return (atomic_inc_local( &tree->nodes[node_index] )) & 0x1ff;
+}
+
+GRL_INLINE ushort LocalBVH2_GetNodeArea( local struct LocalBVH2* tree, ushort nodeID )
+{
+ return (tree->nodes[nodeID] >> 18) & 0x1FFF;
+}
+
+GRL_INLINE bool LocalBVH2_IsInnerNode( local struct LocalBVH2* tree, ushort nodeID )
+{
+ return (tree->nodes[nodeID] & 0x80000000) != 0;
+}
+
+
+GRL_INLINE ushort2 LocalBVH2_GetChildIndices( local struct LocalBVH2* tree, ushort nodeID )
+{
+ ushort idx = ((tree->nodes[nodeID] >> 9) & 0x1FF);
+ return (ushort2)(idx, idx + 1);
+}
+
+GRL_INLINE ushort LocalBVH2_GetSubtreePrimCount( local struct LocalBVH2* tree, ushort node )
+{
+ return tree->nodes[node] & 0x1FF;
+}
+
+GRL_INLINE ushort LocalBVH2_GetLeafPrimStart( local struct LocalBVH2* tree, ushort node )
+{
+ return ((tree->nodes[node] >> 9) & 0x1FF);
+}
+
+
+GRL_INLINE void LocalBVH2_Printf( local struct LocalBVH2* tree )
+{
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "Nodes: %u\n", tree->num_nodes );
+
+ for ( uint i = 0; i < tree->num_nodes; i++ )
+ {
+ uint num_prims = LocalBVH2_GetSubtreePrimCount( tree, i );
+ printf( "%3u : 0x%08x %3u 0x%04x ", i, tree->nodes[i], num_prims, LocalBVH2_GetNodeArea(tree,i) );
+ if ( LocalBVH2_IsInnerNode( tree, i ) )
+ {
+ ushort2 kids = LocalBVH2_GetChildIndices( tree, i );
+ printf( " INNER ( %3u %3u )\n", kids.x, kids.y );
+ }
+ else
+ {
+ printf( " LEAF {" );
+ for ( uint j = 0; j < num_prims; j++ )
+ printf( " %3u ", LocalBVH2_GetLeafPrimStart( tree, i ) + j );
+ printf( "}\n" );
+ }
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+struct FlatTreeInnerNode
+{
+    uint DW0;  // lower 16b are the index of the corresponding LocalBVH2 node.  Bits 30:16 hold a live-child mask cleared atomically during refit.  Bit 31 is a leaf marker
+ ushort parent_index;
+ ushort first_child;
+ uchar index_in_parent;
+ uchar num_children;
+
+ //struct DFSPrimRefAABB AABB;
+};
+
+struct FlatTree
+{
+ uint num_nodes;
+ uint qnode_byte_offset; // byte offset from the BVHBase to the flat-tree's first QNode
+ uint qnode_base_index;
+
+ struct FlatTreeInnerNode nodes[DFS_MAX_FLATTREE_NODES];
+ uchar primref_back_pointers[DFS_WG_SIZE];
+};
+
+GRL_INLINE void FlatTree_Printf( local struct FlatTree* flat_tree )
+{
+ barrier( CLK_LOCAL_MEM_FENCE );
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "NumNodes: %u\n", flat_tree->num_nodes );
+ for ( uint i = 0; i < flat_tree->num_nodes; i++ )
+ {
+ ushort bvh2_node = flat_tree->nodes[i].DW0 & 0xffff;
+ printf( "%2u Parent: %2u Index_in_parent: %u, NumKids: %u FirstKid: %3u bvh2: %3u DW0: 0x%x\n",
+ i,
+ flat_tree->nodes[i].parent_index,
+ flat_tree->nodes[i].index_in_parent,
+ flat_tree->nodes[i].num_children,
+ flat_tree->nodes[i].first_child,
+ bvh2_node,
+ flat_tree->nodes[i].DW0 );
+ }
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+}
+
+
+
+
+GRL_INLINE ushort FlatTree_GetNodeCount( local struct FlatTree* flat_tree )
+{
+ return flat_tree->num_nodes;
+}
+
+GRL_INLINE uint FlatTree_GetParentIndex( local struct FlatTree* flat_tree, ushort id )
+{
+ return flat_tree->nodes[id].parent_index;
+}
+
+GRL_INLINE ushort FlatTree_GetBVH2Root( local struct FlatTree* flat_tree, ushort node_index )
+{
+ return (flat_tree->nodes[node_index].DW0) & 0xffff;
+}
+
+GRL_INLINE ushort FlatTree_GetNumChildren( local struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->nodes[node_index].num_children;
+}
+
+GRL_INLINE bool FlatTree_IsLeafNode( local struct FlatTree* flat_tree, ushort node_index )
+{
+ return (flat_tree->nodes[node_index].DW0 & 0x80000000) != 0;
+}
+
+
+GRL_INLINE uint FlatTree_GetQNodeByteOffset( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->qnode_byte_offset + node_index * sizeof(struct QBVHNodeN);
+}
+
+GRL_INLINE uint FlatTree_GetQNodeIndex( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->qnode_base_index + node_index;
+}
+
+GRL_INLINE void FlatTree_AllocateQNodes( struct FlatTree* flat_tree, struct DFSArgs args )
+{
+ uint node_base = 64*allocate_inner_nodes( args.bvh_base, flat_tree->num_nodes );
+ flat_tree->qnode_base_index = (node_base - BVH_ROOT_NODE_OFFSET) / sizeof( struct QBVHNodeN );
+ flat_tree->qnode_byte_offset = node_base;
+}
+
+GRL_INLINE ushort FlatTree_GetFirstChild( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->nodes[node_index].first_child;
+}
+
+GRL_INLINE ushort FlatTree_GetPrimRefStart( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->nodes[node_index].first_child;
+}
+GRL_INLINE ushort FlatTree_GetPrimRefCount( struct FlatTree* flat_tree, ushort node_index )
+{
+ return flat_tree->nodes[node_index].num_children;
+}
+
+GRL_INLINE uint FlatTree_BuildBackPointer( local struct FlatTree* flat_tree, ushort node_index )
+{
+ uint parent_index = flat_tree->nodes[node_index].parent_index + flat_tree->qnode_base_index;
+ parent_index = (parent_index << 6) | (FlatTree_GetNumChildren( flat_tree, node_index ) << 3);
+ return parent_index;
+}
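+
+// Backpointer layout recap (matches the function above): bits 31:6 hold the parent QNode
+// index and bits 5:3 the child count; bits 2:0 are left at zero here.
+// Illustrative example: parent index 10 with 6 children packs to (10<<6)|(6<<3) = 0x2B0.
+// Elsewhere in these builders the root's backpointer uses the sentinel 0xffffffc0 in
+// place of a parent index.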
+
+
+GRL_INLINE void SUBGROUP_FlatTree_Initialize( uniform local struct FlatTree* flat_tree, struct DFSArgs args )
+{
+ if ( get_sub_group_local_id() == 0 )
+ {
+ flat_tree->num_nodes = 1;
+ flat_tree->nodes[0].DW0 = 0; // point first node at BVH2 root node, which is assumed to be at index zero
+ }
+
+}
+/*
+GRL_INLINE void SUBGROUP_FlatTree_ReduceAndSetAABB( uniform local struct FlatTree* flat_tree,
+ uniform ushort node_index,
+ varying local struct DFSPrimRefAABB* box )
+{
+ // TODO_OPT: Replace this with an optimized reduction which exploits the fact that we only ever have 6 active lanes
+ // Try using the "negated max" trick here to compute min/max simultaneously, with max in top 6 lanes
+ // This will replace 6 reductions with 3
+
+ // TODO_OPT: This only utilizes up to 6 SIMD lanes. We can use up to 12 of them by putting
+ // min into even lanes, and -max into odd lanes, and using a manual min-reduction on pairs of lanes
+
+ struct DFSPrimRefAABB bb = DFSPrimRefAABB_sub_group_reduce( box );
+ if( get_sub_group_local_id() )
+ flat_tree->nodes[node_index].AABB = bb;
+}
+*/
+
+GRL_INLINE void SUBGROUP_FlatTree_CreateInnerNode( uniform local struct FlatTree* flat_tree,
+ uniform ushort flat_tree_root,
+ varying ushort sg_child_bvh2_root,
+ uniform ushort num_children )
+{
+ uniform uint lane = get_sub_group_local_id();
+
+ // increment counter to allocate new nodes.. set required root node fields
+ uniform uint child_base;
+ if ( lane == 0 )
+ {
+ child_base = atomic_add_local( &flat_tree->num_nodes, num_children );
+ flat_tree->nodes[flat_tree_root].first_child = (uchar) child_base;
+ flat_tree->nodes[flat_tree_root].num_children = num_children;
+
+ // initialize mask bits for this node's live children
+ uint child_mask = ((1 << num_children) - 1) << 16;
+ flat_tree->nodes[flat_tree_root].DW0 |= child_mask;
+ }
+
+ child_base = sub_group_broadcast( child_base, 0 );
+
+ // initialize child nodes
+ if ( lane < num_children )
+ {
+ varying uint child = child_base + lane;
+ flat_tree->nodes[child].DW0 = sg_child_bvh2_root;
+ flat_tree->nodes[child].index_in_parent = lane;
+ flat_tree->nodes[child].parent_index = flat_tree_root;
+ }
+
+}
+
+
+
+GRL_INLINE void SUBGROUP_FlatTree_CreateLeafNode( uniform local struct FlatTree* flat_tree,
+ uniform ushort flat_tree_root,
+ uniform ushort primref_start,
+ uniform ushort num_prims )
+{
+ ushort lane = get_sub_group_local_id();
+ if ( lane < num_prims )
+ {
+ flat_tree->primref_back_pointers[primref_start + lane] = (uchar) flat_tree_root;
+ if ( lane == 0 )
+ {
+ flat_tree->nodes[flat_tree_root].first_child = (uchar) primref_start;
+ flat_tree->nodes[flat_tree_root].num_children = (uchar) num_prims;
+ flat_tree->nodes[flat_tree_root].DW0 |= 0x80000000;
+ }
+ }
+}
+
+
+GRL_INLINE uniform bool SUBGROUP_FlatTree_SignalRefitComplete( uniform local struct FlatTree* flat_tree, uniform ushort* p_node_index )
+{
+ uniform ushort node_index = *p_node_index;
+ uniform ushort parent = flat_tree->nodes[node_index].parent_index;
+ uniform ushort index_in_parent = flat_tree->nodes[node_index].index_in_parent;
+
+ // clear the corresponding mask bit in the parent node
+ uniform uint child_mask = (0x10000 << index_in_parent);
+ uniform uint old_mask_bits = 0;
+ if( get_sub_group_local_id() == 0 )
+ old_mask_bits = atomic_xor( &flat_tree->nodes[parent].DW0, child_mask );
+
+ old_mask_bits = sub_group_broadcast( old_mask_bits, 0 );
+
+ // if we cleared the last mask bit, this subgroup proceeds up the tree and refits the next node
+ // otherwise, it looks for something else to do
+ if ( ((old_mask_bits^child_mask) & 0xffff0000) == 0 )
+ {
+ *p_node_index = parent;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+GRL_INLINE local struct DFSPrimRefAABB* FlatTree_GetChildAABB( local struct FlatTree* flat_tree,
+ local struct PrimRefSet* prim_refs,
+ ushort node_index, ushort child_index )
+{
+ ushort child_id = FlatTree_GetFirstChild( flat_tree, node_index ) + child_index;
+
+ if( !FlatTree_IsLeafNode( flat_tree, node_index ) )
+ return &flat_tree->nodes[child_id].AABB;
+ else
+ return PrimRefSet_GetAABBPointer( prim_refs, child_id );
+}
+*/
+GRL_INLINE uint FlatTree_GetPrimRefBackPointer( local struct FlatTree* flat_tree, ushort primref_index )
+{
+ return flat_tree->primref_back_pointers[primref_index] * sizeof(struct QBVHNodeN) + flat_tree->qnode_byte_offset;
+}
+
+
+GRL_INLINE void FlatTree_check_boxes(local struct FlatTree* flat_tree,
+ global struct AABB* primref_buffer,
+ local struct AABB3f* boxes,
+ local struct PrimRefMeta* meta )
+
+{
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (get_local_id(0) == 0)
+ {
+ printf("checking flattree bounds...\n");
+
+ for (uint i = 0; i < flat_tree->num_nodes; i++)
+ {
+ struct AABB rb;
+ rb.lower.xyz = AABB3f_load_lower(&boxes[i]);
+ rb.upper.xyz = AABB3f_load_upper(&boxes[i]);
+
+ uint offs = FlatTree_GetFirstChild( flat_tree, i );
+ uint count = FlatTree_GetNumChildren( flat_tree, i );
+
+ for (uint c = 0; c < count; c++)
+ {
+ struct AABB lb;
+ if (FlatTree_IsLeafNode( flat_tree, i ))
+ {
+ lb = primref_buffer[ PrimRefMeta_GetInputIndex( &meta[offs+c] ) ];
+ }
+ else
+ {
+ lb.lower.xyz = AABB3f_load_lower(&boxes[ offs+c ]);
+ lb.upper.xyz = AABB3f_load_upper(&boxes[ offs+c ]);
+ }
+
+ if( !AABB_subset( &lb, &rb ) )
+ printf("Bad bounds!! child %u of %u %f : %f %f : %f %f : %f %f : %f %f : %f %f : %f \n",
+ c, i ,
+ rb.lower.x, rb.upper.x, rb.lower.y, rb.upper.y, rb.lower.z, rb.upper.z,
+ lb.lower.x, lb.upper.x, lb.lower.y, lb.upper.y, lb.lower.z, lb.upper.z
+ );
+ }
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+
+struct FlatTreeScheduler
+{
+ int num_leafs;
+ uint writeout_produce_count;
+ uint writeout_consume_count;
+ uint active_subgroups;
+ uint num_built_nodes;
+ uint num_levels; // number of depth levels in the tree
+
+ //uchar leaf_indices[DFS_MAX_FLATTREE_LEAFS]; // indices of leaf FlatTree nodes to be refitted
+ //uchar writeout_indices[DFS_MAX_FLATTREE_NODES]; // indices of flattree nodes to be written out or collapsed
+
+ ushort level_ordered_nodes[DFS_MAX_FLATTREE_NODES]; // node indices sorted by depth (pre-order, high depth before low depth)
+ ushort level_start[DFS_MAX_FLATTREE_DEPTH]; // first node at given level in the level-ordered node array
+ uint level_count[DFS_MAX_FLATTREE_DEPTH]; // number of nodes at given level
+};
+
+GRL_INLINE void SUBGROUP_FlatTreeScheduler_Initialize( uniform local struct FlatTreeScheduler* scheduler )
+{
+ scheduler->num_built_nodes = 0;
+ scheduler->num_leafs = 0;
+ scheduler->writeout_produce_count = 0;
+ scheduler->writeout_consume_count = 0;
+ scheduler->active_subgroups = DFS_NUM_SUBGROUPS;
+}
+/*
+GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueLeafForRefit( uniform local struct FlatTreeScheduler* scheduler,
+ uniform ushort leaf )
+{
+ if ( get_sub_group_local_id() == 0 )
+ scheduler->leaf_indices[atomic_inc( &scheduler->num_leafs )] = leaf;
+}*/
+
+GRL_INLINE void SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node )
+{
+ if ( get_sub_group_local_id() == 0 )
+ atomic_inc_local( &scheduler->num_built_nodes );
+}
+
+GRL_INLINE uint FlatTreeScheduler_GetNumBuiltNodes( uniform local struct FlatTreeScheduler* scheduler )
+{
+ return scheduler->num_built_nodes;
+}
+
+/*
+GRL_INLINE void SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut( uniform local struct FlatTreeScheduler* scheduler, uniform ushort node )
+{
+ if ( get_sub_group_local_id() == 0 )
+ scheduler->writeout_indices[atomic_inc( &scheduler->writeout_produce_count )] = node;
+}*/
+
+/*
+GRL_INLINE bool SUBGROUP_FlatTreeScheduler_GetRefitTask( uniform local struct FlatTreeScheduler* scheduler, uniform ushort* leaf_idx )
+{
+ // schedule the leaves in reverse order to ensure that later leaves
+ // complete before earlier ones.. This prevents contention during the WriteOut stage
+ //
+ // There is a barrier between this function and 'QueueLeafForRefit' so we can safely decrement the same counter
+ // that we incremented earlier
+ varying int idx = 0;
+ if( get_sub_group_local_id() == 0 )
+ idx = atomic_dec( &scheduler->num_leafs );
+
+ sub_group_barrier( CLK_LOCAL_MEM_FENCE );
+ idx = sub_group_broadcast( idx, 0 );
+
+ if ( idx <= 0 )
+ return false;
+
+ *leaf_idx = scheduler->leaf_indices[idx-1];
+ return true;
+}*/
+
+/*
+// Signal the scheduler that a subgroup has reached the DONE state.
+// Return true if this is the last subgroup to be done
+void SUBGROUP_FlatTreeScheduler_SubGroupDone( local struct FlatTreeScheduler* scheduler )
+{
+ if ( get_sub_group_local_id() == 0 )
+ atomic_dec( &scheduler->active_subgroups );
+}
+*/
+
+/*
+
+#define STATE_SCHEDULE_REFIT 0x1234
+#define STATE_SCHEDULE_WRITEOUT 0x5679
+#define STATE_REFIT 0xabcd
+#define STATE_WRITEOUT 0xefef
+#define STATE_DONE 0xaabb
+
+// Get a flattree node to write out. Returns the new scheduler state
+GRL_INLINE ushort SUBGROUP_FlatTreeScheduler_GetWriteOutTask( uniform local struct FlatTreeScheduler* scheduler,
+ uniform ushort num_nodes,
+ uniform ushort* node_idx )
+{
+ uniform ushort return_state = STATE_WRITEOUT;
+ uniform ushort idx = 0;
+ if ( get_sub_group_local_id() == 0 )
+ {
+ idx = atomic_inc( &scheduler->writeout_consume_count );
+
+ if ( idx >= scheduler->writeout_produce_count )
+ {
+ // more consumers than there are produced tasks....
+
+ if ( scheduler->writeout_produce_count == num_nodes )
+ {
+ // if all nodes have been written out, flattening is done
+ return_state = STATE_DONE;
+ }
+ else
+ {
+ // some writeout tasks remain, and have not been produced by refit threads yet
+ // we need to put this one back
+ atomic_dec( &scheduler->writeout_consume_count );
+ return_state = STATE_SCHEDULE_WRITEOUT;
+ }
+ }
+ else
+ {
+ // scheduled successfully
+ idx = scheduler->writeout_indices[idx];
+ }
+ }
+
+ *node_idx = sub_group_broadcast( idx, 0 );
+ return sub_group_broadcast( return_state, 0 );
+
+}
+*/
+
+
+/*
+GRL_INLINE void FlatTreeScheduler_Printf( local struct FlatTreeScheduler* scheduler )
+{
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( get_local_id( 0 ) == 0 )
+ {
+ printf( "***SCHEDULER***\n" );
+ printf( "built_nodes=%u active_sgs=%u leafs=%u wo_p=%u wo_c=%u\n", scheduler->num_built_nodes, scheduler->active_subgroups, scheduler->num_leafs,
+ scheduler->writeout_produce_count, scheduler->writeout_consume_count );
+ printf( "leafs for refit: {" );
+
+ int nleaf = max( scheduler->num_leafs, 0 );
+
+ for ( uint i = 0; i < nleaf; i++ )
+ printf( "%u ", scheduler->leaf_indices[i] );
+ printf( "}\n" );
+
+ printf( "writeout queue: %u:%u {", scheduler->writeout_produce_count, scheduler->writeout_consume_count );
+ for ( uint i = 0; i < scheduler->writeout_produce_count; i++ )
+ printf( "%u ", scheduler->writeout_indices[i] );
+ printf( "}\n" );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+}
+*/
+
+
+GRL_INLINE void SUBGROUP_BuildFlatTreeNode( local struct LocalBVH2* bvh2,
+ local struct FlatTree* flat_tree,
+ local struct FlatTreeScheduler* scheduler,
+ uniform ushort flat_tree_root )
+{
+ varying ushort lane = get_sub_group_local_id();
+ varying ushort bvh2_root = FlatTree_GetBVH2Root( flat_tree, flat_tree_root );
+
+ if ( !LocalBVH2_IsInnerNode( bvh2, bvh2_root ) )
+ {
+ uniform ushort num_prims = LocalBVH2_GetSubtreePrimCount( bvh2, bvh2_root );
+ uniform ushort primref_start = LocalBVH2_GetLeafPrimStart( bvh2, bvh2_root );
+
+ SUBGROUP_FlatTree_CreateLeafNode( flat_tree, flat_tree_root, primref_start, num_prims );
+ }
+ else
+ {
+ // collapse BVH2 into BVH6.
+ // We will spread the root node's children across the subgroup, and keep adding SIMD lanes until we have enough
+ uniform ushort num_children = 2;
+
+ uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root );
+ varying ushort sg_bvh2_node = kids.x;
+ if ( lane == 1 )
+ sg_bvh2_node = kids.y;
+
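+        // Greedy collapse sketch: lanes 0..num_children-1 each hold one BVH2 child of this
+        // flat-tree node. Each iteration picks the largest-area inner child, replaces it in
+        // place with its left child, and appends its right child in a new lane, so at most
+        // TREE_ARITY-2 = 4 iterations are needed (fewer if only leaf children remain).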
+ do
+ {
+ // choose the inner node with maximum area to replace.
+ // Its left child goes in its old location. Its right child goes in a new lane
+
+ varying ushort sg_area = LocalBVH2_GetNodeArea( bvh2, sg_bvh2_node );
+ varying bool sg_is_inner = LocalBVH2_IsInnerNode( bvh2, sg_bvh2_node );
+ sg_area = (sg_is_inner && lane < num_children) ? sg_area : 0; // prevent early exit if the largest child is a leaf
+
+ uniform ushort max_area = sub_group_reduce_max( sg_area );
+ varying bool sg_reducable = max_area == sg_area && (lane < num_children) && sg_is_inner;
+ uniform uint mask = intel_sub_group_ballot( sg_reducable );
+
+            // TODO_OPT: Some of these ops seem redundant; look at trimming further.
+            // TODO_OPT: sub_group_reduce_max results in too many instructions. Unroll the loop and
+            //    specialize it, or ask IGC for a version that declares a static maximum number of subgroups to use.
+
+ if ( mask == 0 )
+ break;
+
+ // choose the inner node with maximum area to replace
+ uniform ushort victim_child = ctz( mask );
+ uniform ushort victim_node = sub_group_broadcast( sg_bvh2_node, victim_child );
+ uniform ushort2 kids = LocalBVH2_GetChildIndices( bvh2, victim_node );
+
+ if ( lane == victim_child )
+ sg_bvh2_node = kids.x;
+ else if ( lane == num_children )
+ sg_bvh2_node = kids.y;
+
+
+ num_children++;
+
+
+ }while ( num_children < TREE_ARITY );
+
+ SUBGROUP_FlatTree_CreateInnerNode( flat_tree, flat_tree_root, sg_bvh2_node, num_children );
+ }
+
+}
+
+
+GRL_INLINE void SUBGROUP_DFS_BuildFlatTree( uniform local struct LocalBVH2* bvh2,
+ uniform local struct FlatTree* flat_tree,
+ uniform local struct FlatTreeScheduler* scheduler
+ )
+{
+
+ uniform ushort flat_tree_node_index = get_sub_group_id();
+ uniform ushort num_nodes = 1;
+ uniform ushort num_built = 0;
+
+ uint tid = get_local_id(0);
+ if (tid < DFS_MAX_FLATTREE_DEPTH)
+ {
+ scheduler->level_start[tid] = DFS_MAX_FLATTREE_NODES;
+ scheduler->level_count[tid] = 0;
+ scheduler->num_levels = 0;
+ }
+
+ LOOP_TRIPWIRE_INIT;
+
+ do
+ {
+ // process one flat tree node per sub group, as many as are available
+ //
+        // The first pass will only run one sub-group, the second up to 6, the third up to 36, and so on.
+        // Nodes are processed in breadth-first order, but they are not guaranteed to be stored in this
+        // order due to the use of atomic counters for node allocation.
+ //
+ if ( flat_tree_node_index < num_nodes )
+ {
+ SUBGROUP_BuildFlatTreeNode( bvh2, flat_tree, scheduler, flat_tree_node_index );
+ SUBGROUP_FlatTreeScheduler_SignalNodeBuilt( scheduler, flat_tree_node_index );
+ flat_tree_node_index += get_num_sub_groups();
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // bump up the node count if new nodes were created
+ // stop as soon as all flattree nodes have been processed
+ num_nodes = FlatTree_GetNodeCount( flat_tree );
+ num_built = FlatTreeScheduler_GetNumBuiltNodes( scheduler );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ LOOP_TRIPWIRE_INCREMENT( 300 );
+
+ } while ( num_built < num_nodes );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+
+ // determine depth of each node, compute node ranges and counts for each depth level,
+ // and prepare a depth-ordered node index array
+ uint depth = 0;
+ uint level_pos = 0;
+ for( uint i=tid; i<num_nodes; i += get_local_size(0) )
+ {
+ // compute depth of this node
+ uint node_index = i;
+ while ( node_index != 0 )
+ {
+ node_index = FlatTree_GetParentIndex( flat_tree, node_index );
+ depth++;
+ }
+
+        // assign this node a position within its depth level
+ level_pos = atomic_inc_local( &scheduler->level_count[depth] );
+
+ // compute total number of levels
+ atomic_max_local( &scheduler->num_levels, depth+1 );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ for( uint i=tid; i<num_nodes; i += get_local_size(0) )
+ {
+ // prefix-sum level start positions. Re-computed for each thread
+ // TODO: Hierarchical reduction ??
+ uint level_start=0;
+ for( uint d=0; d<depth; d++ )
+ level_start += scheduler->level_count[d];
+
+ scheduler->level_start[depth] = level_start;
+
+ // scatter node indices into level-ordered node array
+ scheduler->level_ordered_nodes[level_start + level_pos] = tid;
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+}
+
+/*
+GRL_INLINE bool SUBGROUP_RefitNode( uniform local struct FlatTree* flat_tree,
+ uniform local struct PrimRefSet* prim_refs,
+ uniform ushort* p_node_index )
+{
+
+ // fetch and reduce child AABBs across the subgroup
+ uniform ushort node_index = *p_node_index;
+ uniform ushort num_kids = FlatTree_GetNumChildren( flat_tree, node_index );
+ varying ushort sg_child_index = (get_sub_group_local_id() < num_kids) ? get_sub_group_local_id() : 0;
+
+ varying local struct DFSPrimRefAABB* box = FlatTree_GetChildAABB( flat_tree, prim_refs, node_index, sg_child_index );
+
+ SUBGROUP_FlatTree_ReduceAndSetAABB( flat_tree, node_index, box );
+
+ if ( node_index == 0 )
+ return false; // if we just refitted the root, we can stop now
+
+ // signal the parent node that this node was refitted. If this was the last child to be refitted
+ // returns true and sets 'node_index' to the parent node, so that this thread can continue refitting
+ return SUBGROUP_FlatTree_SignalRefitComplete( flat_tree, p_node_index );
+}*/
+
+GRL_INLINE struct QBVHNodeN* qnode_ptr( BVHBase* bvh_mem, uint byte_offset )
+{
+ return (struct QBVHNodeN*)(((char*)bvh_mem) + byte_offset);
+}
+
+GRL_INLINE void SUBGROUP_WriteQBVHNode(
+ uniform local struct FlatTree* flat_tree,
+ uniform local struct PrimRefMeta* primref_meta,
+ uniform local struct AABB3f* boxes,
+ uniform ushort flat_tree_root,
+ uniform struct DFSArgs args,
+ uniform local uchar* masks
+ )
+{
+
+
+ uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root );
+ uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root );
+
+ varying ushort lane = get_sub_group_local_id();
+ varying ushort sg_child_index = (lane < num_children) ? lane : 0;
+
+ uniform ushort child_base = FlatTree_GetFirstChild( flat_tree, flat_tree_root );
+
+ varying struct AABB sg_box4;
+ if (FlatTree_IsLeafNode( flat_tree, flat_tree_root ))
+ {
+ // fetch AABBs for primrefs
+ sg_box4 = args.primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_base + sg_child_index] ) ];
+
+ }
+ else
+ {
+ // fetch AABBs for child nodes
+ sg_box4.lower.xyz = AABB3f_load_lower( &boxes[child_base+sg_child_index] );
+ sg_box4.upper.xyz = AABB3f_load_upper( &boxes[child_base+sg_child_index] );
+ }
+
+
+ struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) );
+
+ uniform int offset;
+ uniform uint child_type;
+ if ( is_leaf )
+ {
+ char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base );
+
+ leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes;
+
+ offset = (int)(leaf_mem - (char*)qnode);
+ child_type = args.leaf_node_type;
+ }
+ else
+ {
+ struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) );
+ offset = (int) ((char*)kid - (char*)qnode);
+ child_type = args.inner_node_type;
+ }
+ offset = offset >> 6;
+
+ if (child_type == NODE_TYPE_INSTANCE)
+ {
+ uint instanceMask = PrimRefMeta_GetInstanceMask( &primref_meta[child_base + sg_child_index] );
+ subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? instanceMask : 0 );
+ }
+ else
+ {
+ uint mask = BVH_NODE_DEFAULT_MASK;
+ if( args.need_masks )
+ mask = masks[flat_tree_root];
+
+ subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode, mask );
+ }
+
+ if ( args.need_backpointers )
+ {
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base );
+ uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root );
+ uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root );
+ back_pointers[idx] = bp;
+ }
+
+ /*
+ // TODO_OPT: Eventually this section should also handle leaf splitting due to mixed primref types
+ // For now this is done by the leaf creation pipeline, but that path should probably be refactored
+ // such that all inner node creation is done in one place
+
+ uniform ushort num_children = FlatTree_GetNumChildren( flat_tree, flat_tree_root );
+ uniform bool is_leaf = FlatTree_IsLeafNode( flat_tree, flat_tree_root );
+
+ varying ushort lane = get_sub_group_local_id();
+ varying ushort sg_child_index = (lane < num_children) ? lane : 0;
+
+ varying local struct DFSPrimRefAABB* sg_box = FlatTree_GetChildAABB( flat_tree, prim_refs, flat_tree_root, sg_child_index );
+
+ varying struct AABB sg_box4 = PrimRefSet_ConvertAABB( prim_refs, sg_box );
+
+ struct QBVHNodeN* qnode = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, flat_tree_root ) );
+
+ uniform int offset;
+ uniform uint child_type;
+ if ( is_leaf )
+ {
+ char* leaf_mem = (char*)BVHBase_GetQuadLeaves( args.bvh_base );
+
+ leaf_mem += ( FlatTree_GetPrimRefStart( flat_tree, flat_tree_root )) * args.leaf_size_in_bytes;
+
+ offset = (int)(leaf_mem - (char*)qnode);
+ child_type = args.leaf_node_type;
+ }
+ else
+ {
+ struct QBVHNodeN* kid = qnode_ptr( args.bvh_base, FlatTree_GetQNodeByteOffset( flat_tree, FlatTree_GetFirstChild( flat_tree, flat_tree_root ) ) );
+ offset = (int) ((char*)kid - (char*)qnode);
+ child_type = args.inner_node_type;
+ }
+ offset = offset >> 6;
+
+ if (child_type == NODE_TYPE_INSTANCE)
+ {
+ uint instanceMask = PrimRefSet_GetInstanceMask( prim_refs, FlatTree_GetPrimRefStart(flat_tree, flat_tree_root) + lane );
+ subgroup_setInstanceQBVHNodeN( offset, &sg_box4, num_children, qnode, lane < num_children ? instanceMask : 0 );
+ }
+ else
+ subgroup_setQBVHNodeN( offset, child_type, &sg_box4, num_children, qnode );
+
+ if ( args.need_backpointers )
+ {
+ global uint* back_pointers = (global uint*) BVHBase_GetBackPointers( args.bvh_base );
+ uint idx = FlatTree_GetQNodeIndex( flat_tree, flat_tree_root );
+ uint bp = FlatTree_BuildBackPointer( flat_tree, flat_tree_root );
+ back_pointers[idx] = bp;
+ }
+ */
+}
+
+/*
+GRL_INLINE void SUBGROUP_DFS_RefitAndWriteOutFlatTree(
+ uniform local struct FlatTree* flat_tree,
+ uniform local struct PrimRefSet* prim_refs,
+ uniform local struct FlatTreeScheduler* scheduler,
+ uniform struct DFSArgs args)
+{
+
+ uniform ushort state = STATE_SCHEDULE_REFIT;
+ uniform ushort node_index = 0;
+ uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree);
+
+ {
+ LOOP_TRIPWIRE_INIT;
+
+ bool active = true;
+ bool continue_refit = false;
+ while (1)
+ {
+ if (active)
+ {
+ if (continue_refit || SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index))
+ {
+ continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index);
+ }
+ else
+ {
+ active = false;
+ if (get_sub_group_local_id() == 0)
+ atomic_dec(&scheduler->active_subgroups);
+
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE); // finish all atomics
+ if (scheduler->active_subgroups == 0)
+ break;
+ barrier(CLK_LOCAL_MEM_FENCE); // finish all checks.. prevent race between thread which loops around and thread which doesn't
+
+ LOOP_TRIPWIRE_INCREMENT(200);
+ }
+ }
+
+ for (uint i = get_sub_group_id(); i < num_nodes; i += get_num_sub_groups())
+ SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, i, args);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+
+ // JDB: Version below attempts to interleave refit and qnode write-out
+ // This could theoretically reduce thread idle time, but it is more complex and does more atomics for scheduling
+
+#if 0
+ // after we've constructed the flat tree (phase 1), there are two things that need to happen:
+    // PHASE 2: Refit the flat tree, computing all of the node AABBs
+ // PHASE 3: Write the nodes out to memory
+ //
+ // all of this is sub-group centric. Different subgroups can execute phases 2 and 3 concurrently
+ //
+
+ // TODO_OPT: The scheduling algorithm might need to be re-thought.
+ // Fused EUs are very hard to reason about. It's possible that by scheduling independent
+ // SGs in this way we would lose a lot of performance due to fused EU serialization.
+ // Needs to be tested experimentally if such a thing is possible
+
+ uniform ushort state = STATE_SCHEDULE_REFIT;
+ uniform ushort node_index = 0;
+ uniform ushort num_nodes = FlatTree_GetNodeCount(flat_tree);
+
+ LOOP_TRIPWIRE_INIT;
+
+ do
+ {
+ // barrier necessary to protect access to scheduler->active_subgroups
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (state == STATE_SCHEDULE_REFIT)
+ {
+ if (SUBGROUP_FlatTreeScheduler_GetRefitTask(scheduler, &node_index))
+ state = STATE_REFIT;
+ else
+ state = STATE_SCHEDULE_WRITEOUT; // fallthrough
+ }
+ if (state == STATE_SCHEDULE_WRITEOUT)
+ {
+ state = SUBGROUP_FlatTreeScheduler_GetWriteOutTask(scheduler, num_nodes, &node_index);
+ if (state == STATE_DONE)
+ SUBGROUP_FlatTreeScheduler_SubGroupDone(scheduler);
+ }
+
+
+ // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask'
+ // Note that in theory we could have the write-out tasks spin until the refit tasks clear, which would make this barrier unnecessary
+ // However, we cannot do this safely on SKUs which do not support independent subgroup forward progress.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (state == STATE_REFIT)
+ {
+ uniform ushort prev_node = node_index;
+ uniform bool continue_refit = SUBGROUP_RefitNode(flat_tree, prim_refs, &node_index);
+
+ SUBGROUP_FlatTreeScheduler_QueueNodeForWriteOut(scheduler, prev_node);
+
+ if (!continue_refit)
+ state = STATE_SCHEDULE_REFIT;
+ }
+ else if (state == STATE_WRITEOUT)
+ {
+ SUBGROUP_WriteQBVHInnerNodes(flat_tree, prim_refs, node_index, args);
+ state = STATE_SCHEDULE_WRITEOUT;
+ }
+ // A barrier is necessary to ensure that 'QueueNodeForWriteOut' is synchronized with 'GetWriteOutTask'
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ LOOP_TRIPWIRE_INCREMENT(200);
+
+ } while (scheduler->active_subgroups > 0);
+
+#endif
+}
+*/
+
+GRL_INLINE void DFS_CreatePrimRefSet( struct DFSArgs args,
+ local struct PrimRefSet* prim_refs )
+{
+ ushort id = get_local_id( 0 );
+ ushort num_primrefs = args.num_primrefs;
+
+
+ PrimRef ref;
+ struct AABB3f local_aabb;
+ if ( id < num_primrefs )
+ {
+ ref = args.primref_buffer[id];
+ AABB3f_set_lower( &local_aabb, ref.lower.xyz );
+ AABB3f_set_upper( &local_aabb, ref.upper.xyz );
+ }
+ else
+ {
+ AABB3f_init( &local_aabb );
+ }
+
+ AABB3f_atomic_merge_localBB_nocheck( &prim_refs->root_aabb, &local_aabb );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( id < num_primrefs )
+ PrimRefSet_SetPrimRef_FullPrecision( prim_refs, ref, id );
+}
+
+
+
+struct BVHBuildLocals
+{
+ float Al[DFS_WG_SIZE];
+ float Ar[DFS_WG_SIZE];
+ uchar2 axis_and_left_count[ DFS_WG_SIZE ];
+ uint sah[DFS_WG_SIZE];
+ uint num_active_threads;
+};
+
+
+GRL_INLINE void DFS_ConstructBVH2( local struct LocalBVH2* bvh2,
+ local struct PrimRefSet* prim_refs,
+ ushort num_prims,
+ local struct BVHBuildLocals* locals )
+{
+ ushort tid = get_local_id( 0 );
+
+ ushort bvh2_root = 0;
+ ushort prim_range_start = 0;
+ ushort primref_position = tid;
+
+ bool active_thread = tid < num_prims;
+ float root_area = PrimRefSet_GetMaxAABBArea( prim_refs );
+ float area_scale = DFS_BVH2_AREA_QUANT / root_area;
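+    // subtree areas are later quantized to 16 bits using this scale factor
+    // (see the convert_ushort_rtn calls below)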
+
+ locals->num_active_threads = num_prims;
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ LOOP_TRIPWIRE_INIT;
+
+ do
+ {
+ if(active_thread && prim_range_start == primref_position)
+ locals->sah[primref_position] = UINT_MAX;
+
+ if ( active_thread )
+ {
+ local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position );
+
+ // each thread evaluates a possible split candidate. Scan primrefs and compute sah cost
+ // do this axis-by-axis to keep register pressure low
+ float best_sah = INFINITY;
+ ushort best_axis = 3;
+ ushort best_count = 0;
+ float best_al = INFINITY;
+ float best_ar = INFINITY;
+
+ struct DFSPrimRefAABB box_left[3];
+ struct DFSPrimRefAABB box_right[3];
+ float CSplit[3];
+ ushort count_left[3];
+
+ for ( ushort axis = 0; axis < 3; axis++ )
+ {
+ DFSPrimRefAABB_init( &box_left[axis] );
+ DFSPrimRefAABB_init( &box_right[axis] );
+
+ CSplit[axis] = my_box->lower[axis] + my_box->upper[axis];
+ count_left[axis] = 0;
+ }
+
+ // scan primrefs in our subtree and partition using this thread's prim as a split plane
+ {
+ struct DFSPrimRefAABB box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start );
+
+ for ( ushort p = 1; p < num_prims; p++ )
+ {
+ struct DFSPrimRefAABB next_box = *PrimRefSet_GetAABBPointer( prim_refs, prim_range_start + p ); //preloading box for next iteration
+
+ for( ushort axis = 0; axis < 3; axis++ )
+ {
+ float c = box.lower[axis] + box.upper[axis];
+
+ if ( c < CSplit[axis] )
+ {
+ // this primitive is to our left.
+ DFSPrimRefAABB_extend( &box_left[axis], &box );
+ count_left[axis]++;
+ }
+ else
+ {
+ // this primitive is to our right
+ DFSPrimRefAABB_extend( &box_right[axis], &box );
+ }
+ }
+
+ box = next_box;
+ }
+
+ // last iteration without preloading box
+ for( ushort axis = 0; axis < 3; axis++ )
+ {
+ float c = box.lower[axis] + box.upper[axis];
+
+ if ( c < CSplit[axis] )
+ {
+ // this primitive is to our left.
+ DFSPrimRefAABB_extend( &box_left[axis], &box );
+ count_left[axis]++;
+ }
+ else
+ {
+ // this primitive is to our right
+ DFSPrimRefAABB_extend( &box_right[axis], &box );
+ }
+ }
+ }
+
+ for ( ushort axis = 0; axis < 3; axis++ )
+ {
+ float Al = DFSPrimRefAABB_halfArea( &box_left[axis] );
+ float Ar = DFSPrimRefAABB_halfArea( &box_right[axis] );
+
+ // Avoid NANs in SAH calculation in the corner case where all prims go right
+ // In this case we set Al=Ar, because such a split will only be selected if all primrefs
+            // are coincident. In that case, we will fall back to split-in-the-middle and both subtrees
+ // should store the same quantized area value
+ if ( count_left[axis] == 0 )
+ Al = Ar;
+
+ // compute sah cost
+ ushort count_right = num_prims - count_left[axis];
+ float sah = Ar * count_right + Al * count_left[axis];
+
+ // keep this split if it is better than the previous one, or if the previous one was a corner-case
+ if ( sah < best_sah || best_count == 0 )
+ {
+ // yes, keep it
+ best_axis = axis;
+ best_sah = sah;
+ best_count = count_left[axis];
+ best_al = Al;
+ best_ar = Ar;
+ }
+ }
+
+
+ // write split information to SLM
+ locals->Al[primref_position] = best_al;
+ locals->Ar[primref_position] = best_ar;
+ locals->axis_and_left_count[primref_position].x = best_axis;
+ locals->axis_and_left_count[primref_position].y = best_count;
+
+ uint sah = as_uint(best_sah);
+            // break ties by axis to ensure deterministic split selection;
+            // otherwise the builder can produce a non-deterministic tree structure from run to run,
+            // depending on the ordering of primitives (which can vary due to non-determinism in atomic counters)
+ // Embed split axis and index into sah value; compute min over sah and max over axis
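+            // Packed key layout (derived from the expression below): bits [31:10] keep the
+            // upper bits of the float SAH cost, bits [9:8] hold (2 - best_axis) so the larger
+            // axis wins ties under atomic_min, and bits [7:0] hold the candidate index
+            // (this assumes DFS_WG_SIZE <= 256 so the index fits in 8 bits).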
+ sah = ( ( sah & ~1023 ) | ( 2 - best_axis ) << 8 | primref_position );
+
+ // reduce on split candidates in our local subtree and decide the best one
+ atomic_min_local( &locals->sah[ prim_range_start ], sah);
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ ushort split_index = locals->sah[ prim_range_start ] & 255;
+ ushort split_axis = locals->axis_and_left_count[split_index].x;
+ ushort split_left_count = locals->axis_and_left_count[split_index].y;
+ float split_al = locals->Al[split_index];
+ float split_ar = locals->Ar[split_index];
+
+ if ( (primref_position == prim_range_start) && active_thread )
+ {
+ // first thread in a given subtree creates the inner node
+ ushort quantized_left_area = convert_ushort_rtn( split_al * area_scale );
+ ushort quantized_right_area = convert_ushort_rtn( split_ar * area_scale );
+ ushort start_left = prim_range_start;
+ ushort start_right = prim_range_start + split_left_count;
+ if ( split_left_count == 0 )
+ start_right = start_left + (num_prims / 2); // handle split-in-the-middle case
+
+ LocalBVH2_CreateInnerNode( bvh2, bvh2_root,
+ start_left, start_right,
+ quantized_left_area, quantized_right_area );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ struct DFSPrimRef ref;
+ ushort new_primref_position;
+
+ if ( active_thread )
+ {
+ ushort2 kids = LocalBVH2_GetChildIndices( bvh2, bvh2_root );
+ bool go_left;
+
+ if ( split_left_count == 0 )
+ {
+ // We chose a split with no left-side prims
+ // This will only happen if all primrefs are located in the exact same position
+ // In that case, fall back to split-in-the-middle
+ split_left_count = (num_prims / 2);
+ go_left = (primref_position - prim_range_start < split_left_count);
+ }
+ else
+ {
+ // determine what side of the split this thread's primref belongs on
+ local struct DFSPrimRefAABB* my_box = PrimRefSet_GetAABBPointer( prim_refs, primref_position );
+ local struct DFSPrimRefAABB* split_box = PrimRefSet_GetAABBPointer( prim_refs, split_index );
+ float c = my_box->lower[split_axis] + my_box->upper[split_axis];
+ float Csplit = split_box->lower[split_axis] + split_box->upper[split_axis];
+ go_left = c < Csplit;
+ }
+
+ // adjust state variables for next loop iteration
+ bvh2_root = (go_left) ? kids.x : kids.y;
+ num_prims = (go_left) ? split_left_count : (num_prims - split_left_count);
+ prim_range_start = (go_left) ? prim_range_start : prim_range_start + split_left_count;
+
+ // determine the new primref position by incrementing a counter in the destination subtree
+ new_primref_position = prim_range_start + LocalBVH2_IncrementPrimCount( bvh2, bvh2_root );
+
+ // load our primref from its previous position
+ ref = PrimRefSet_GetPrimRef( prim_refs, primref_position );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if ( active_thread )
+ {
+ // write our primref into its sorted position
+ PrimRefSet_SetPrimRef( prim_refs, ref, new_primref_position );
+ primref_position = new_primref_position;
+
+ // deactivate all threads whose subtrees are small enough to form a leaf
+ if ( num_prims <= TREE_ARITY )
+ {
+ active_thread = false;
+ atomic_dec_local( &locals->num_active_threads );
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ LOOP_TRIPWIRE_INCREMENT( 50 );
+
+
+ } while ( locals->num_active_threads > 0 );
+
+
+}
+
+
+
+// fast path for #prims <= TREE_ARITY
+GRL_INLINE void Trivial_DFS( struct DFSArgs args )
+{
+
+ ushort tid = get_local_id( 0 );
+
+ PrimRef myRef;
+ AABB_init( &myRef );
+ if( tid < args.num_primrefs )
+ myRef = args.primref_buffer[tid];
+
+ uint node_offset;
+ if ( tid == 0 )
+ node_offset = 64*allocate_inner_nodes( args.bvh_base, 1 );
+ node_offset = sub_group_broadcast(node_offset,0);
+
+ char* bvh_mem = (char*) args.bvh_base;
+ struct QBVHNodeN* qnode = (struct QBVHNodeN*) (bvh_mem + node_offset);
+
+ uint child_type = args.leaf_node_type;
+ uint prim_base = args.bvh_base->quadLeafStart*64 ;
+
+ char* leaf_mem = bvh_mem + prim_base;
+ int offset = (int)( leaf_mem - (char*)qnode );
+
+ if (child_type == NODE_TYPE_INSTANCE)
+ {
+ subgroup_setInstanceQBVHNodeN( offset >> 6, &myRef, args.num_primrefs, qnode, tid < args.num_primrefs ? PRIMREF_instanceMask(&myRef) : 0 );
+ }
+ else
+ subgroup_setQBVHNodeN( offset >> 6, child_type, &myRef, args.num_primrefs, qnode, BVH_NODE_DEFAULT_MASK );
+
+ if ( tid < args.num_primrefs )
+ {
+ global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs;
+ uint bp = node_offset;
+
+ // TODO_OPT: Leaf creation pipeline can be made simpler by having a sideband buffer containing
+ // fatleaf index + position in fatleaf for each primref, instead of forcing leaf creation shader to reconstruct it
+ // should also probably do the fat-leaf splitting here
+ args.primref_buffer[tid] = myRef;
+ args.primref_index_buffer[tid] = tid;
+
+ primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN);
+
+ if ( tid == 0 && args.need_backpointers )
+ {
+ uint bp = ((uint)-1) << 6;
+ bp |= (args.num_primrefs) << 3;
+ *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) = bp;
+ }
+ }
+}
+
+
+
+
+
+void SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( uniform local struct FlatTree* flat_tree,
+ uniform local struct FlatTreeScheduler* flat_scheduler,
+ uniform local struct AABB3f* boxes,
+ uniform local struct PrimRefMeta* primref_meta,
+ uniform global struct AABB* primref_buffer,
+ uniform local uchar* masks,
+ bool need_masks )
+
+{
+ uniform int num_levels = (int) flat_scheduler->num_levels;
+ varying ushort lane = get_sub_group_local_id();
+
+ // iterate over depth levels in the tree... deepest to shallowest
+ for (uniform int level = num_levels - 1; level >= 0; level--)
+ {
+ // loop over a range of flattree nodes at this level, one node per sub-group
+ // TODO_OPT: Try and enable this code to process two nodes in a SIMD16 subgroup
+ uniform ushort level_start = flat_scheduler->level_start[level];
+ uniform ushort level_node_count = flat_scheduler->level_count[level];
+
+ for (uniform ushort i = get_sub_group_id(); i < level_node_count; i += get_num_sub_groups())
+ {
+ uniform ushort node_index = flat_scheduler->level_ordered_nodes[ level_start + i ];
+
+ varying struct AABB box;
+ AABB_init(&box);
+
+ uniform uint child_base = FlatTree_GetFirstChild( flat_tree, node_index );
+ uniform uint num_children = FlatTree_GetNumChildren( flat_tree, node_index );
+ varying uint child_index = child_base + ((lane<num_children)?lane : 0);
+
+ varying uint mask = 0xff;
+ if (FlatTree_IsLeafNode( flat_tree, node_index ))
+ {
+ // fetch AABBs for primrefs
+ box = primref_buffer[ PrimRefMeta_GetInputIndex( &primref_meta[child_index] ) ];
+ if( need_masks )
+ mask = PRIMREF_instanceMask(&box);
+ }
+ else
+ {
+ // fetch AABBs for child nodes
+ box.lower.xyz = AABB3f_load_lower( &boxes[child_index] );
+ box.upper.xyz = AABB3f_load_upper( &boxes[child_index] );
+ if ( need_masks )
+ mask = masks[child_index];
+ }
+
+
+ // reduce and write box
+ box = AABB_sub_group_reduce_N6( &box );
+ if( lane == 0 )
+ AABB3f_set( &boxes[node_index], box.lower.xyz, box.upper.xyz );
+
+ if( need_masks )
+ {
+ mask = sub_group_reduce_or_N6(mask);
+ masks[node_index] = mask;
+ }
+
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+ }
+}
+
+
+void SUBGROUP_DFS_WriteNodes(
+ uniform local struct FlatTree* flat_tree,
+ uniform local struct AABB3f* boxes,
+ uniform local struct PrimRefMeta* primref_meta,
+ uniform struct DFSArgs args,
+ uniform local uchar* masks
+ )
+
+{
+ uniform uint num_nodes = FlatTree_GetNodeCount(flat_tree);
+
+ for ( uniform uint i = get_sub_group_id(); i < num_nodes; i += get_num_sub_groups() )
+ {
+ SUBGROUP_WriteQBVHNode( flat_tree, primref_meta, boxes, i, args, masks );
+ }
+
+}
+
+
+
+
+struct Single_WG_build_SLM
+{
+ struct FlatTree flat_tree;
+ struct FlatTreeScheduler flat_scheduler;
+ struct PrimRefMeta primitive_meta[DFS_WG_SIZE];
+
+ union
+ {
+ struct{
+ struct PrimRefSet prim_refs;
+ struct LocalBVH2 bvh2;
+ struct BVHBuildLocals bvh2_locals;
+ } s1;
+
+ struct {
+ struct AABB3f boxes[DFS_MAX_FLATTREE_NODES];
+ uchar masks[DFS_MAX_FLATTREE_NODES];
+ } s2;
+ } u;
+
+};
+
+
+GRL_INLINE void execute_single_WG_build(
+ struct DFSArgs args,
+ local struct Single_WG_build_SLM* slm
+ )
+{
+
+ ushort tid = get_local_id( 0 );
+
+ //
+ // Initialize the various SLM structures. Different sub-groups take different init paths.
+ // NOTE: even numbered subgroups here to avoid the fused-EU serialization bug
+ //
+ if ( get_sub_group_id() == 0 )
+ SUBGROUP_FlatTree_Initialize( &slm->flat_tree, args );
+ else if ( get_sub_group_id() == 2 )
+ SUBGROUP_LocalBVH2_Initialize( &slm->u.s1.bvh2, args.num_primrefs );
+ else if ( get_sub_group_id() == 4 )
+ SUBGROUP_FlatTreeScheduler_Initialize( &slm->flat_scheduler );
+ else if ( get_sub_group_id() == 6 )
+ SUBGROUP_PrimRefSet_Initialize( &slm->u.s1.prim_refs );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // load the PrimRefs
+ DFS_CreatePrimRefSet( args, &slm->u.s1.prim_refs );
+
+ // build the BVH2
+ DFS_ConstructBVH2( &slm->u.s1.bvh2, &slm->u.s1.prim_refs, args.num_primrefs, &slm->u.s1.bvh2_locals );
+
+ // copy out metadata for primrefs now that they have been sorted
+ if( tid < args.num_primrefs )
+ {
+ slm->primitive_meta[tid] = PrimRefSet_GetMeta( &slm->u.s1.prim_refs, tid );
+ }
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // collapse into a FlatTree
+ SUBGROUP_DFS_BuildFlatTree( &slm->u.s1.bvh2, &slm->flat_tree, &slm->flat_scheduler );
+
+ // allocate output QBVH6 nodes
+ if ( get_local_id( 0 ) == 0 )
+ FlatTree_AllocateQNodes( &slm->flat_tree, args );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ SUBGROUP_DFS_ComputeFlatTreeBoxesAndMasks( &slm->flat_tree, &slm->flat_scheduler, &slm->u.s2.boxes[0], slm->primitive_meta, args.primref_buffer, slm->u.s2.masks, args.need_masks );
+
+ //FlatTree_Printf( &slm->flat_tree );
+ //FlatTree_check_boxes ( &slm->flat_tree, args.primref_buffer, &slm->u.s2.boxes[0], slm->primitive_meta );
+
+ SUBGROUP_DFS_WriteNodes( &slm->flat_tree, &slm->u.s2.boxes[0], slm->primitive_meta, args, slm->u.s2.masks );
+
+
+    // generate the sorted primref index buffer and backpointers to feed the leaf creation pipeline
+ if ( tid < args.num_primrefs )
+ {
+ uint input_index = PrimRefMeta_GetInputIndex(&slm->primitive_meta[tid]);
+
+ uint bp = FlatTree_GetPrimRefBackPointer( &slm->flat_tree, tid );
+ global uint* primref_back_pointers = args.primref_index_buffer + args.num_primrefs;
+
+ args.primref_index_buffer[tid] = input_index;
+
+ primref_back_pointers[tid] = bp / sizeof(struct QBVHNodeN);
+
+ if ( tid == 0 && args.need_backpointers )
+ {
+ *(InnerNode_GetBackPointer(BVHBase_GetBackPointers( args.bvh_base ),0)) |= ((uint)-1) << 6;
+ }
+ }
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void DFS( global struct Globals* globals,
+ global char* bvh_mem,
+ global PrimRef* primref_buffer,
+ global uint* primref_index_buffer,
+ uint alloc_backpointers
+ )
+{
+ struct DFSArgs args;
+ args.bvh_base = (global struct BVHBase*) bvh_mem;
+ args.leaf_node_type = globals->leafPrimType;
+ args.inner_node_type = NODE_TYPE_INTERNAL;
+ args.leaf_size_in_bytes = globals->leafSize;
+ args.primref_buffer = primref_buffer;
+ args.need_backpointers = alloc_backpointers != 0;
+ args.num_primrefs = globals->numPrimitives;
+ args.primref_index_buffer = primref_index_buffer;
+ args.need_masks = args.leaf_node_type == NODE_TYPE_INSTANCE;
+
+ if ( args.num_primrefs <= TREE_ARITY )
+ {
+ // TODO_OPT: This decision should be made using indirect dispatch
+ if( get_sub_group_id() == 0 )
+ Trivial_DFS( args );
+ return;
+ }
+
+ local struct Single_WG_build_SLM slm;
+
+ execute_single_WG_build( args, &slm );
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( DFS_WG_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void DFS_single_wg(
+ global struct Globals* globals,
+ global char* bvh_mem,
+ global PrimRef* primref_buffer,
+ global uint* primref_index_buffer,
+ uint sah_flags
+)
+{
+ struct DFSArgs args;
+ args.bvh_base = (global struct BVHBase*) bvh_mem;
+ args.leaf_node_type = globals->leafPrimType;
+ args.inner_node_type = NODE_TYPE_INTERNAL;
+ args.leaf_size_in_bytes = globals->leafSize;
+ args.primref_buffer = primref_buffer;
+ args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS;
+ args.num_primrefs = globals->numPrimitives;
+ args.primref_index_buffer = primref_index_buffer;
+ args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS;
+
+ local struct Single_WG_build_SLM slm;
+
+ execute_single_WG_build( args, &slm );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+kernel void DFS_trivial(
+ global struct Globals* globals,
+ global char* bvh_mem,
+ global PrimRef* primref_buffer,
+ global uint* primref_index_buffer,
+ uint sah_flags
+)
+{
+ struct DFSArgs args;
+ args.bvh_base = (global struct BVHBase*) bvh_mem;
+ args.leaf_node_type = globals->leafPrimType;
+ args.inner_node_type = NODE_TYPE_INTERNAL;
+ args.leaf_size_in_bytes = globals->leafSize;
+ args.primref_buffer = primref_buffer;
+ args.need_backpointers = sah_flags & SAH_FLAG_NEED_BACKPOINTERS;
+ args.num_primrefs = globals->numPrimitives;
+ args.primref_index_buffer = primref_index_buffer;
+ args.need_masks = sah_flags & SAH_FLAG_NEED_MASKS;
+
+ Trivial_DFS( args );
+}
+
+
+struct DFSArgs dfs_args_from_sah_globals( global struct SAHBuildGlobals* sah_globals )
+{
+ struct DFSArgs args;
+ args.bvh_base = (global struct BVHBase*) sah_globals->p_bvh_base;
+ args.leaf_node_type = sah_globals->leaf_type;
+ args.inner_node_type = NODE_TYPE_INTERNAL;
+ args.leaf_size_in_bytes = sah_globals->leaf_size;
+ args.primref_buffer = (global PrimRef*) sah_globals->p_primrefs_buffer;
+ args.need_backpointers = sah_globals->flags & SAH_FLAG_NEED_BACKPOINTERS;
+ args.num_primrefs = sah_globals->num_primrefs;
+ args.primref_index_buffer = (global uint*) sah_globals->p_primref_index_buffers;
+ args.need_masks = sah_globals->flags & SAH_FLAG_NEED_MASKS;
+
+ return args;
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(DFS_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void DFS_single_wg_batchable(
+ global struct SAHBuildGlobals* globals_buffer,
+ global struct VContextScheduler* scheduler
+)
+{
+ global struct SAHBuildGlobals* sah_globals = globals_buffer + scheduler->num_trivial_builds + get_group_id(0);
+
+ struct DFSArgs args = dfs_args_from_sah_globals( sah_globals );
+
+ local struct Single_WG_build_SLM slm;
+
+ execute_single_WG_build(args, &slm);
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+kernel void DFS_trivial_batchable(
+ global struct SAHBuildGlobals* globals_buffer
+)
+{
+ global struct SAHBuildGlobals* sah_globals = globals_buffer + get_group_id(0);
+
+ struct DFSArgs args = dfs_args_from_sah_globals(sah_globals);
+
+ Trivial_DFS(args);
+}
\ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl
new file mode 100644
index 00000000000..bb220b30612
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_leaf.cl
@@ -0,0 +1,357 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+#include "instance.h"
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+primref_to_quads(global struct Globals *globals,
+ global struct AABB *primref,
+ global char *primref_index,
+ global char *bvh_mem,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ const uint stride,
+ const uint offset,
+ const uint allow_update)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
+ uint quadIndicesStart = bvh->quadIndicesDataStart;
+
+ const uint numPrimitives = globals->numPrimitives;
+ uint i = get_group_id( 0 ) * get_local_size( 0 ) + get_local_id(0);
+ if (i < numPrimitives)
+ {
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+
+ const uint primrefID = *(uint *)(primref_index + i * stride + offset);
+
+ const uint geomID = PRIMREF_geomID(&primref[primrefID]);
+ const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
+ const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
+ const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);
+
+ const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
+ const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);
+
+ const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);
+
+ uint vertex_stride = geomDesc[geomID].Desc.Triangles.VertexBufferByteStride;
+
+ const uint4 indices = q.a;
+
+ const uint mask = 0xff; // FIXME: hardcoded mask
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);
+
+ uint j0 = q.lb.x;
+ uint j1 = q.lb.y;
+ uint j2 = q.lb.z;
+ uint shaderIndex = (mask << 24) | geomID;
+ uint geomIndex = geomID | (geomFlags << 30);
+ uint primIndex0 = primID0;
+ const uint delta = primID1 - primID0;
+ const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
+ uint primIndex1Delta = delta | (j << 16) | (1 << 22);
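+        // Packing note (derived from the expressions above): primIndex1Delta holds the
+        // primID delta in its low bits, j0/j1/j2 (2 bits each, assumed to select the
+        // second triangle's vertices within the quad) in bits [21:16], and sets bit 22,
+        // whose hardware meaning is not documented here.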
+
+ uint4 pack0 = (uint4)(shaderIndex, geomIndex, primIndex0, primIndex1Delta);
+ float4 pack1 = (float4)(vtx0.x, vtx0.y, vtx0.z, vtx1.x);
+ float4 pack2 = (float4)(vtx1.y, vtx1.z, vtx2.x, vtx2.y);
+ float4 pack3 = (float4)(vtx2.z, vtx3.x, vtx3.y, vtx3.z);
+
+ global uint4* dst = (global uint4*)&quads[i];
+ store_uint4_L1WB_L3WB(dst, 0, pack0);
+ store_uint4_L1WB_L3WB(dst, 1, as_uint4(pack1));
+ store_uint4_L1WB_L3WB(dst, 2, as_uint4(pack2));
+ store_uint4_L1WB_L3WB(dst, 3, as_uint4(pack3));
+
+ if(allow_update)
+ {
+ global uint4* vertex_indice_ptr = (global uint4*)(((char*)bvh) + (64u * quadIndicesStart + 32 * i));
+
+ uint4 pack_indices = (uint4) ( indices.x , indices.y, indices.z, indices.w );
+
+ store_uint4_L1WB_L3WB( vertex_indice_ptr, 0, pack0 );
+ store_uint4_L1WB_L3WB( vertex_indice_ptr, 1, pack_indices * vertex_stride);
+ }
+
+ if (i == 0)
+ bvh->quadLeafCur += numPrimitives ;
+ }
+
+
+
+#if 0
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct Quad *quads = (global struct Quad *)(bvh_mem + 64*bvh->quadLeafStart );
+
+ const uint numPrimitives = globals->numPrimitives;
+ const uint startID = get_group_id( 0 ) * get_local_size( 0 );
+ const uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);
+
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ const uint primrefID = *(uint *)(primref_index + i * stride + offset);
+
+ const uint geomID = PRIMREF_geomID(&primref[primrefID]);
+ const uint primID0 = PRIMREF_primID0(&primref[primrefID]);
+ const uint primID1 = PRIMREF_primID1(&primref[primrefID]);
+ const uint geomFlags = PRIMREF_geomFlags(&primref[primrefID]);
+
+ const uint3 tri0 = GRL_load_triangle(&geomDesc[geomID], primID0);
+ const uint3 tri1 = GRL_load_triangle(&geomDesc[geomID], primID1);
+
+ const struct TrianglePair q = TrianglePair_Constructor(tri0, primID0, tri1, primID1);
+
+ const uint4 indices = q.a;
+ const uint mask = 0xff; // FIXME: hardcoded mask
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices(&geomDesc[geomID], &vtx0, &vtx1, &vtx2, &vtx3, indices);
+
+ setQuad(&quads[i], (float4)(vtx0,0), (float4)(vtx1,0), (float4)(vtx2,0), (float4)(vtx3,0), q.lb.x, q.lb.y, q.lb.z, geomID, primID0, primID1, mask, geomFlags );
+ }
+
+ if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
+ bvh->quadLeafCur += numPrimitives ;
+#endif
+}
+
+GRL_INLINE void create_procedural_leaf(global struct Globals *globals,
+ global struct AABB *primref,
+ local uint *primrefids,
+ uint numProcedurals,
+ struct QBVHNodeN *qnode,
+ global char *bvh_mem,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ if (get_local_id(0) >= 8)
+ return;
+
+ global struct BVHBase* bvh_base = (global struct BVHBase*)bvh_mem;
+
+ /* first read geomID of all primitives */
+ uint primrefID = -1;
+ uint geomID = -1;
+ uint geomFlags = 0;
+ if (get_local_id(0) < numProcedurals)
+ {
+ primrefID = primrefids[get_local_id(0)];
+ geomID = PRIMREF_geomID(&primref[primrefID]);
+ geomFlags = PRIMREF_geomFlags( &primref[primrefID] );
+ }
+
+ // cannot sort by geomID as bounds in parent node are then wrong
+ //ulong geomID_primrefID = (((ulong)geomID) << 32) | ((ulong)primrefID);
+ //geomID_primrefID = sort8_ascending_ulong(geomID_primrefID);
+ //geomID = geomID_primrefID >> 32;
+ //primrefID = geomID_primrefID;
+
+ /* We have to split at geomID boundaries into multiple leaves. This
+ * block calculates the lane where a leaf starts and ends. */
+ const uint geomIDprev = intel_sub_group_shuffle_up(0xFFFFFFFFu, geomID, 1u);
+ const uint geomIDnext = intel_sub_group_shuffle_down(geomID, 0xFFFFFFFFu, 1u);
+ const uint leaf_start = geomIDprev != geomID;
+ const uint leaf_end = geomIDnext != geomID;
+ const uint leaf_start_next = intel_sub_group_shuffle_down(leaf_start, 0u, 1u);
+
+    /* This computes which leaf a lane processes. E.g. from geomID =
+ * [3,3,4,4,4,0] we get leaf_id = [0,0,1,1,1,2] */
+ //const uint leaf_id = sub_group_scan_inclusive_add(leaf_start); // FIXME: exclusive?
+
+ /* This computes the n'th primitive a lane processes inside its
+ * leaf. For the example above we compute leaf_prim =
+ * [0,1,0,1,2,0]. */
+ const uint leaf_prim = get_local_id(0) - sub_group_scan_inclusive_max(leaf_start ? get_local_id(0) : 0);
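+    // Worked example (illustrative): for geomID = [3,3,4,4,4,0], leaf_start = [1,0,1,0,0,1],
+    // so (leaf_start ? get_local_id(0) : 0) = [0,0,2,0,0,5], its inclusive max-scan is
+    // [0,0,2,2,2,5], and leaf_prim = get_local_id(0) - scan = [0,1,0,1,2,0], matching the example above.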
+
+ /* from here on we allocate data and write to memory, thus only
+ * lanes that process a primitive should continue. */
+ if (get_local_id(0) >= numProcedurals)
+ return;
+
+ /* Here we allocate a single memory block for each required
+ * ProceduralLeaf node. We do this from a single lane to ensure
+ * the allocation is contiguous. */
+ uint leaf_base_offset = 0;
+ uint n_leafs = sub_group_reduce_add(leaf_start);
+ if (get_local_id(0) == 0)
+ leaf_base_offset = allocate_procedural_leaves( bvh_base, n_leafs );
+ leaf_base_offset = sub_group_broadcast(leaf_base_offset, 0);
+
+ /* Compute the leaf offset for each lane. */
+ uint leaf_offset = leaf_base_offset + sub_group_scan_inclusive_add(leaf_start) - 1;
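+    // Continuing the example above: leaf_start = [1,0,1,0,0,1] gives n_leafs = 3 and
+    // sub_group_scan_inclusive_add(leaf_start) - 1 = [0,0,1,1,1,2], i.e. each lane's
+    // ProceduralLeaf slot relative to leaf_base_offset.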
+
+ struct ProceduralLeaf *pleaf = ((global struct ProceduralLeaf *)(bvh_mem)) + leaf_offset;
+
+ /* write the procedural leaf headers */
+ if (leaf_end)
+ {
+ pleaf->leafDesc.shaderIndex_geomMask = 0xFF000000 | (geomID & 0x00FFFFFF); // FIXME: use accessor function. Future extensions may have shaderIndex != geomID
+ pleaf->leafDesc.geomIndex_flags = geomID | (geomFlags<<30); // FIXME: Use setter function
+ pleaf->DW1 = 0xFFFFFFF0 | (leaf_prim + 1); // !!!
+ }
+ /* write the procedural leaf primIDs */
+ pleaf->_primIndex[leaf_prim] = PRIMREF_primID0(&primref[primrefID]);
+
+ /* update leaf node offset inside parent node */
+ if (get_local_id(0) == 0)
+ {
+ QBVH6Node_set_offset(qnode, pleaf);
+ QBVH6Node_set_type(qnode, NODE_TYPE_PROCEDURAL);
+ }
+
+ /* Let parent node children point to proper procedural leaf block
+ * and primitive. */
+ qnode->childData[get_local_id(0)] = leaf_start_next | (leaf_prim << 2);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+GRL_ANNOTATE_BIG_REG_REQ
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+primref_to_procedurals(global struct Globals *globals,
+ global struct AABB *primref,
+ global char *primref_index,
+ global char *bvh_mem,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ const uint stride,
+ const uint offset)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+
+ const uint numPrimitives = globals->numPrimitives;
+ uint startID = get_group_id( 0 ) * get_local_size( 0 );
+ uint endID = min((uint)(startID + get_local_size( 0 )), numPrimitives);
+
+ uint offset1 = stride * globals->numPrimitives;
+ if (stride == 8)
+ offset1 = 4;
+
+ uint prev_start_back_pointer = startID == 0 ? -1 : *(uint *)(primref_index + (startID-1) * stride + offset1);
+ /* start at leaf start */
+ while (startID < numPrimitives)
+ {
+ const uint back_pointer = *(uint *)(primref_index + startID * stride + offset1);
+ if (back_pointer != prev_start_back_pointer)
+ break;
+ startID++;
+ }
+
+ uint prev_end_back_pointer = *(uint *)(primref_index + (endID-1) * stride + offset1);
+ /* end at next leaf start */
+ while (endID < numPrimitives)
+ {
+ const uint back_pointer = *(uint *)(primref_index + endID * stride + offset1);
+ if (back_pointer != prev_end_back_pointer)
+ break;
+ endID++;
+ }
+
+ local uint procedurals[16];
+
+ for (uint lid = startID + get_local_id(0); lid < endID + get_local_id(0);)
+ {
+ /* load leaf start points and back_pointer */
+ const uint primrefID = *(uint *)(primref_index + lid * stride + offset);
+ uint back_pointer = *(uint *)(primref_index + lid * stride + offset1);
+ uint prev_back_pointer = get_local_id(0) == 0 ? -1 : *(uint *)(primref_index + (lid-1) * stride + offset1);
+
+ const uint leaf_start = back_pointer != prev_back_pointer;
+ uint leaf_start_back_pointer = sub_group_broadcast(back_pointer, 0);
+
+ /* compute number of primitives inside the leaf starting at lid */
+ const uint leaf_id = sub_group_scan_inclusive_add(leaf_start);
+ uint numPrimitives = 0;
+ if (back_pointer == leaf_start_back_pointer && lid < endID)
+ numPrimitives = sub_group_reduce_add(1);
+ numPrimitives = sub_group_broadcast(numPrimitives, 0);
+
+ procedurals[get_local_id(0)] = primrefID;
+
+ struct QBVHNodeN *qnode = (struct QBVHNodeN *)bvh_mem + back_pointer;
+
+ create_procedural_leaf(globals, primref, procedurals, numPrimitives, qnode, bvh_mem, geomDesc);
+
+ lid += numPrimitives;
+ }
+}
+
+GRL_INLINE void create_HW_instance_leaf(
+ global struct BVHBase* bvh,
+ global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
+ uint dstLeafId,
+ uint instanceIndex,
+ uint rootNodeByteOffset,
+ uint instanceMask)
+{
+ /* convert DXR instance to instance leaf node */
+ global struct HwInstanceLeaf* leaves = (__global struct HwInstanceLeaf*)BVHBase_quadLeaves(bvh);
+ HwInstanceLeaf_Constructor(&leaves[dstLeafId], instDesc, instanceIndex, rootNodeByteOffset, instanceMask);
+}
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel create_HW_instance_nodes(
+ global const struct Globals *globals,
+ global char *primref_index,
+ global struct AABB *primref,
+ global struct BVHBase *bvh,
+ global struct GRL_RAYTRACING_INSTANCE_DESC *src_instances,
+ uint32_t stride,
+ uint32_t offset)
+{
+ uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
+ uint num_prims = globals->numPrimitives;
+ if (dstLeafId >= num_prims)
+ return;
+ if( dstLeafId == 0 )
+ bvh->instanceLeafEnd += 2*num_prims;
+
+ /* get instance ID */
+ const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
+ const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
+ const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
+ const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
+ create_HW_instance_leaf(bvh, &src_instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel create_HW_instance_nodes_pointers(
+ global const struct Globals *globals,
+ global char *primref_index,
+ global struct AABB *primref,
+ global struct BVHBase *bvh,
+ global void *instances_in,
+ uint32_t stride,
+ uint32_t offset)
+{
+ uint dstLeafId = get_group_id(0) * MAX_HW_SIMD_WIDTH + get_sub_group_local_id();
+ uint num_prims = globals->numPrimitives;
+ if (dstLeafId >= num_prims)
+ return;
+ if (dstLeafId == 0)
+ bvh->instanceLeafEnd += 2 * num_prims;
+
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ /* get instance ID */
+ const uint primrefID = *(uint *)(primref_index + dstLeafId * stride + offset);
+ const uint instIndex = PRIMREF_instanceIndex(&primref[primrefID]);
+ const uint rootByteOffset = PRIMREF_instanceRootNodeOffset(&primref[primrefID]);
+ const uint instMask = PRIMREF_instanceMask(&primref[primrefID]);
+ create_HW_instance_leaf(bvh, instances[instIndex], dstLeafId, instIndex, rootByteOffset, instMask );
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl
new file mode 100644
index 00000000000..bc9cf590f51
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_presplit.cl
@@ -0,0 +1,556 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+
+#define GRID_SIZE 1024
+
+/*
+ A PresplitItem stores, for each primitive, a splitting priority (used to
+ derive the number of splits to perform) and the index of its primref.
+ */
+
+struct PresplitItem
+{
+ unsigned int index;
+ float priority;
+};
+
+/*
+
+ This function splits a line v0->v1 at position pos in dimension dim
+ and merges the bounds for the left and right line segments into
+ lbounds and rbounds.
+
+ */
+
+GRL_INLINE void splitLine(const uint dim,
+ const float pos,
+ const float4 v0,
+ const float4 v1,
+ struct AABB *lbounds,
+ struct AABB *rbounds)
+{
+ const float v0d = v0[dim];
+ const float v1d = v1[dim];
+
+ /* this point is on left side */
+ if (v0d <= pos)
+ AABB_extend_point(lbounds, v0);
+
+ /* this point is on right side */
+ if (v0d >= pos)
+ AABB_extend_point(rbounds, v0);
+
+ /* the edge crosses the splitting location */
+ if ((v0d < pos && pos < v1d) || (v1d < pos && pos < v0d))
+ {
+ const float f = (pos - v0d) / (v1d - v0d);
+ const float4 c = f * (v1 - v0) + v0;
+ AABB_extend_point(lbounds, c);
+ AABB_extend_point(rbounds, c);
+ }
+}
+
+/*
+
+ This function splits a clipped triangle v0,v1,v2 with bounds prim at
+ position pos in dimension dim and merges the bounds for the left and
+ right clipped triangle fragments into lbounds and rbounds.
+
+ */
+
+GRL_INLINE void splitTriangle(struct AABB *prim,
+ const uint dim,
+ const float pos,
+ const float4 v0,
+ const float4 v1,
+ const float4 v2,
+ struct AABB *lbounds,
+ struct AABB *rbounds)
+{
+ /* clip each triangle edge */
+ splitLine(dim, pos, v0, v1, lbounds, rbounds);
+ splitLine(dim, pos, v1, v2, lbounds, rbounds);
+ splitLine(dim, pos, v2, v0, lbounds, rbounds);
+
+ /* the triangle itself was clipped already, thus clip against triangle bounds */
+ AABB_intersect(lbounds, prim);
+ AABB_intersect(rbounds, prim);
+}
+
+float calculate_priority(struct AABB *prim, global GRL_RAYTRACING_GEOMETRY_DESC *geom)
+{
+ /* calculate projected area of first triangles */
+ const uint primID0 = PRIMREF_primID0(prim);
+ const uint3 tri0 = GRL_load_triangle(geom, primID0);
+ const float4 av0 = GRL_load_vertex(geom, tri0.x);
+ const float4 av1 = GRL_load_vertex(geom, tri0.y);
+ const float4 av2 = GRL_load_vertex(geom, tri0.z);
+ const float area_tri0 = areaProjectedTriangle(av0, av1, av2);
+
+ /* calculate projected area of second triangle */
+ const uint primID1 = PRIMREF_primID1(prim);
+ const uint3 tri1 = GRL_load_triangle(geom, primID1);
+ const float4 bv0 = GRL_load_vertex(geom, tri1.x);
+ const float4 bv1 = GRL_load_vertex(geom, tri1.y);
+ const float4 bv2 = GRL_load_vertex(geom, tri1.z);
+ const float area_tri1 = areaProjectedTriangle(bv0, bv1, bv2);
+
+ /* as priority we use the AABB area */
+ const float area_aabb = AABB_halfArea(prim);
+ float priority = area_aabb;
+
+ /* prefer triangles with a large potential SAH gain. */
+ const float area_tris = area_tri0 + area_tri1;
+ const float area_ratio = min(4.0f, area_aabb / max(1E-12f, area_tris));
+ priority *= area_ratio;
+
+ /* ignore too small primitives */
+ //const float4 size = AABB_size(prim);
+ //const float max_size = max(size.x,max(size.y,size.z));
+ //if (max_size < 0.5f*max_scene_size/GRID_SIZE)
+ // priority = 0.0f;
+
+ return priority;
+}
+
+/*
+
+ This kernel calculates for each primitive an estimated splitting priority.
+
+ */
+
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel compute_num_presplits(global struct Globals *globals,
+ global struct BVHBase* bvh_base,
+ global struct AABB *primref,
+ global struct PresplitItem *presplit,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ //assert(sizeof(PresplitItem) == sizeof_PresplitItem);
+
+ /* calculate the range of primitives each work group should process */
+ const uint numPrimitives = globals->numPrimitives;
+ const uint startID = (get_group_id(0) + 0) * numPrimitives / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numPrimitives / get_num_groups(0);
+
+ /* get scene bounding box size */
+ const float3 scene_size = AABB3f_size(&bvh_base->Meta.bounds);
+ const float max_scene_size = max(scene_size.x, max(scene_size.y, scene_size.z));
+
+ /* each work group iterates over its range of primitives */
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ const uint geomID = PRIMREF_geomID(&primref[i]);
+
+ /* splitting heuristic for triangles */
+ if (GRL_is_triangle(&geomDesc[geomID]))
+ {
+ presplit[i].index = i;
+ presplit[i].priority = calculate_priority(&primref[i], &geomDesc[geomID]);
+ }
+
+ /* splitting of procedurals is not supported */
+ else if (GRL_is_procedural(&geomDesc[geomID]))
+ {
+ presplit[i].index = i;
+ presplit[i].priority = 0.0f;
+ }
+
+ else
+ {
+ //assert(false);
+ }
+ }
+
+ if (get_local_id(0) + get_group_id(0)*get_local_size(0) == 0)
+ globals->numOriginalPrimitives = globals->numPrimitives;
+}
+
+/*
+
+ This kernel computes the sum of all priorities.
+
+ */
+
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+priority_sum(global struct Globals *globals,
+ global struct PresplitItem *presplit,
+ uint numPrimitivesToSplit)
+{
+ const uint N = globals->numPrimitives;
+ const uint j = get_local_id(0);
+ const uint J = get_local_size(0);
+ const uint BLOCKSIZE = (N + J - 1) / J;
+ const uint start = min((j + 0) * BLOCKSIZE, N);
+ const uint end = min((j + 1) * BLOCKSIZE, N);
+
+ float prioritySum = 0;
+ for (uint i = start; i < end; i++)
+ prioritySum += presplit[i].priority;
+
+ prioritySum = work_group_reduce_add(prioritySum);
+ globals->presplitPrioritySum = prioritySum;
+
+#if 0
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+ float scale = 1.0f;
+ for (uint i = 0; i < 10; i++)
+ {
+ //if (j == 0)
+ //printf("prioritySum = %f\n",scale*prioritySum);
+
+ uint numSplits = 0;
+ for (uint i = start; i < end; i++)
+ numSplits += presplit[i].priority / (scale*prioritySum)*numPrimitivesToSplit;
+
+ numSplits = work_group_reduce_add(numSplits);
+
+ if (numSplits > numPrimitivesToSplit)
+ break;
+
+ //if (j == 0)
+ // printf("numSplits = %i (%i)\n",numSplits,numPrimitivesToSplit);
+
+ globals->presplitPrioritySum = scale * prioritySum;
+ scale -= 0.05f;
+ }
+#endif
+}
+
+GRL_INLINE void heapify_down(struct AABB *array, uint size)
+{
+ /* we start at the root */
+ uint cur_node_id = 0;
+ struct AABB *cur_node = array;
+
+ while (true)
+ {
+ int larger_node_id = cur_node_id;
+ struct AABB *larger_node = cur_node;
+
+ /* check if left child is largest */
+ const int left_node_id = 2 * cur_node_id + 1;
+ struct AABB *left_node = &array[left_node_id];
+ if (left_node_id < size && AABB_halfArea(left_node) > AABB_halfArea(larger_node))
+ {
+ larger_node_id = left_node_id;
+ larger_node = left_node;
+ }
+
+ /* check if right child is largest */
+ const int right_node_id = 2 * cur_node_id + 2;
+ struct AABB *right_node = &array[right_node_id];
+ if (right_node_id < size && AABB_halfArea(right_node) > AABB_halfArea(larger_node))
+ {
+ larger_node_id = right_node_id;
+ larger_node = right_node;
+ }
+
+        /* if the current node is the largest, the heap property is fulfilled and we are done */
+ if (larger_node_id == cur_node_id)
+ break;
+
+ /* otherwise we swap cur and largest */
+ struct AABB tmp = *cur_node;
+ *cur_node = *larger_node;
+ *larger_node = tmp;
+
+ /* we continue downwards with the largest node */
+ cur_node_id = larger_node_id;
+ cur_node = larger_node;
+ }
+}
+
+GRL_INLINE void heapify_up(struct AABB *array, uint cur_node_id)
+{
+ /* stop if we start at the root */
+ if (cur_node_id == 0)
+ return;
+
+ struct AABB *cur_node = &array[cur_node_id];
+
+ /* we loop until we reach the root node */
+ while (cur_node_id)
+ {
+ /* get parent node */
+ uint parent_node_id = (cur_node_id - 1) / 2;
+ struct AABB *parent_node = &array[parent_node_id];
+
+        /* if the parent is larger than the current node, the heap property is fulfilled and we can terminate */
+ if (AABB_halfArea(parent_node) > AABB_halfArea(cur_node))
+ break;
+
+ /* otherwise we swap cur and parent */
+ struct AABB tmp = *cur_node;
+ *cur_node = *parent_node;
+ *parent_node = tmp;
+
+ /* and continue upwards */
+ cur_node_id = parent_node_id;
+ cur_node = parent_node;
+ }
+}
+
+/* splits a quad primref */
+GRL_INLINE void splitQuadPrimRef(global GRL_RAYTRACING_GEOMETRY_DESC *geom,
+ struct AABB *cur, uint dim, float fsplit,
+ struct AABB *left, struct AABB *right)
+{
+ /* left and right bounds to compute */
+ AABB_init(left);
+ AABB_init(right);
+
+ /* load first triangle and split it */
+ const uint primID0 = PRIMREF_primID0(cur);
+ const uint3 tri0 = GRL_load_triangle(geom, primID0);
+ const float4 av0 = GRL_load_vertex(geom, tri0.x);
+ const float4 av1 = GRL_load_vertex(geom, tri0.y);
+ const float4 av2 = GRL_load_vertex(geom, tri0.z);
+ splitTriangle(cur, dim, fsplit, av0, av1, av2, left, right);
+
+ /* load second triangle and split it */
+ const uint primID1 = PRIMREF_primID1(cur);
+ const uint3 tri1 = GRL_load_triangle(geom, primID1);
+ const float4 bv0 = GRL_load_vertex(geom, tri1.x);
+ const float4 bv1 = GRL_load_vertex(geom, tri1.y);
+ const float4 bv2 = GRL_load_vertex(geom, tri1.z);
+ splitTriangle(cur, dim, fsplit, bv0, bv1, bv2, left, right);
+
+ /* copy the PrimRef payload into left and right */
+ left->lower.w = cur->lower.w;
+ left->upper.w = cur->upper.w;
+ right->lower.w = cur->lower.w;
+ right->upper.w = cur->upper.w;
+}
+
+/*
+
+ This kernel performs the actual pre-splitting. It selects split
+ locations based on an implicit octree over the scene.
+
+ */
+
+#define USE_HEAP 0
+#define HEAP_SIZE 32u
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+//__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+perform_presplits(global struct Globals *globals,
+ global struct BVHBase* bvh_base,
+ global struct AABB *primref,
+ global struct PresplitItem *presplit,
+ global char *bvh_mem,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ uint numPrimitivesToSplit)
+{
+ /* calculate the range of primitives each work group should process */
+ const uint numPrimitives = globals->numPrimitives;
+ int pstart = globals->numOriginalPrimitives - numPrimitivesToSplit;
+ pstart = max(0, pstart);
+ const uint numPrimitivesToProcess = globals->numPrimitives - pstart;
+ const uint startID = (get_group_id(0) + 0) * numPrimitivesToProcess / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numPrimitivesToProcess / get_num_groups(0);
+
+ /* calculates the 3D grid */
+ float4 grid_base;
+ grid_base.xyz = AABB3f_load_lower( &bvh_base->Meta.bounds );
+ grid_base.w = 0;
+
+ float4 grid_extend;
+ grid_extend.xyz = AABB3f_size(&bvh_base->Meta.bounds);
+ grid_extend.w=0;
+
+ grid_extend = max(grid_extend.x, max(grid_extend.y, grid_extend.z));
+ const float4 grid_scale = select(GRID_SIZE / grid_extend, 0.0f, grid_extend == 0.0f);
+ const float inv_grid_size = 1.0f / GRID_SIZE;
+
+ /* we have to update centroid bounds */
+ struct AABB centroidBounds;
+ AABB_init(&centroidBounds);
+
+ /* initialize heap */
+ struct AABB heap[HEAP_SIZE];
+ uint heap_size = 0;
+
+ /* each work group iterates over its range of primitives */
+ for (uint j = startID + get_local_id(0); j < endID; j += get_local_size(0))
+ {
+ /* array is in ascending order */
+ //const uint ID = numPrimitives-1-j;
+ const uint ID = pstart + j;
+ const float prob = presplit[ID].priority;
+ const uint i = presplit[ID].index;
+ const uint geomID = PRIMREF_geomID(&primref[i]);
+
+ /* do not split primitives with low splitting priority */
+ if (prob <= 0.0f)
+ continue;
+
+ /* we support splitting only for triangles */
+ if (!GRL_is_triangle(&geomDesc[geomID]))
+ continue;
+
+ /* compute number of split primitives to produce */
+ uint numSplitPrims = prob / globals->presplitPrioritySum * numPrimitivesToSplit;
+ numSplitPrims = min(HEAP_SIZE, numSplitPrims);
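+        // the global split budget (numPrimitivesToSplit) is distributed across primitives
+        // in proportion to their priority, capped at the local heap capacity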
+
+        /* stop if no splits need to be performed */
+ if (numSplitPrims <= 1)
+ continue;
+
+ /* add primref to heap */
+ heap[0] = primref[i];
+ heap_size = 1;
+ uint heap_pos = 0;
+
+ /* iterate until all splits are done */
+ uint prims = 1;
+ uint last_heap_size = heap_size;
+ while (prims < numSplitPrims)
+ {
+ /* map the primitive bounds to the grid */
+ const float4 lower = heap[heap_pos].lower;
+ const float4 upper = heap[heap_pos].upper;
+ const float4 glower = (lower - grid_base) * grid_scale + 0.2f;
+ const float4 gupper = (upper - grid_base) * grid_scale - 0.2f;
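+            // Note: the +/-0.2f offsets shrink the box slightly in grid coordinates,
+            // presumably so a box that only grazes a cell boundary does not select
+            // that boundary as a split plane.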
+ uint4 ilower = convert_uint4_rtz(glower);
+ uint4 iupper = convert_uint4_rtz(gupper);
+
+ /* this ignores dimensions that are empty */
+ if (glower.x >= gupper.x)
+ iupper.x = ilower.x;
+ if (glower.y >= gupper.y)
+ iupper.y = ilower.y;
+ if (glower.z >= gupper.z)
+ iupper.z = ilower.z;
+
+ /* Now we compute a morton code for the lower and upper grid
+ * coordinates. */
+ const uint lower_code = bitInterleave3D(ilower);
+ const uint upper_code = bitInterleave3D(iupper);
+
+ /* if all bits are equal then we cannot split */
+ if (lower_code == upper_code)
+ {
+#if !USE_HEAP
+ prims++; // !!!!!!!
+
+ heap_pos++;
+ if (heap_pos == last_heap_size)
+ {
+ heap_pos = 0;
+ last_heap_size = heap_size;
+ }
+ continue;
+#else
+ if (heap_size == 1)
+ break;
+
+ const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1);
+ primref[offset] = heap[heap_pos];
+
+ presplit[offset].index = offset;
+ presplit[offset].priority = calculate_priority(&heap[heap_pos], &geomDesc[geomID]);
+
+ heap[0] = heap[--heap_size];
+ heapify_down(heap, heap_size);
+ continue;
+#endif
+ }
+
+ /* We find the bit position of the first differing bit from the
+ * top down. This bit indicates a split position inside an
+ * implicit octree. */
+ const uint diff = 31 - clz(lower_code ^ upper_code);
+
+ /* compute octree level and dimension to perform the split in */
+ const uint level = diff / 3;
+ const uint dim = diff % 3;
+
+ /* now we compute the grid position of the split */
+ const uint isplit = iupper[dim] & ~((1 << level) - 1);
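+            // Illustrative example: diff = 7 gives level = 2 and dim = 1 (the y axis),
+            // so isplit = iupper.y & ~3, i.e. the split snaps to a multiple of 4 grid
+            // cells (an octree cell boundary at that level).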
+
+ /* compute world space position of split */
+ const float fsplit = grid_base[dim] + isplit * inv_grid_size * grid_extend[dim];
+
+ /* split primref into left and right part */
+ struct AABB left, right;
+ splitQuadPrimRef(&geomDesc[geomID], &heap[heap_pos], dim, fsplit, &left, &right);
+ prims++;
+
+ /* update centroid bounds */
+ AABB_extend_point(&centroidBounds, AABB_centroid2(&left));
+ AABB_extend_point(&centroidBounds, AABB_centroid2(&right));
+
+#if !USE_HEAP
+
+ heap[heap_pos] = left;
+ heap[heap_size] = right;
+ heap_size++;
+
+ heap_pos++;
+ if (heap_pos == last_heap_size)
+ {
+ heap_pos = 0;
+ last_heap_size = heap_size;
+ }
+#else
+
+ /* insert left element into heap */
+ heap[0] = left;
+ heapify_down(heap, heap_size);
+
+ /* insert right element into heap */
+ heap[heap_size] = right;
+ heapify_up(heap, heap_size);
+
+ heap_size++;
+#endif
+ }
+
+        /* copy primitives back to the primref array */
+ primref[i] = heap[0];
+
+ presplit[ID].index = i;
+ presplit[ID].priority = calculate_priority(&heap[0], &geomDesc[geomID]);
+
+ for (uint k = 1; k < heap_size; k++)
+ {
+ const uint offset = numPrimitives + atomic_add(&globals->numSplittedPrimitives, 1);
+ primref[offset] = heap[k];
+
+ presplit[offset].index = offset;
+ presplit[offset].priority = calculate_priority(&heap[k], &geomDesc[geomID]);
+ }
+ }
+
+ /* merge centroid bounds into global bounds */
+ centroidBounds = AABB_sub_group_reduce(&centroidBounds);
+ if (get_sub_group_local_id() == 0)
+ AABB_global_atomic_merge(&globals->centroidBounds, &centroidBounds);
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+ /* update number of primitives on finish */
+ if (Globals_OnFinish(globals))
+ {
+ globals->numPrimitives = globals->numPrimitives + globals->numSplittedPrimitives;
+ globals->numSplittedPrimitives = 0;
+
+ /* update first build record */ // FIXME: should be done in builder itself
+ global struct BuildRecord *record = (global struct BuildRecord *)(bvh_mem + bvh_base->quadLeafStart*64);
+ record->end = globals->numPrimitives;
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.cl b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl
new file mode 100644
index 00000000000..1dd9a3cdd92
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_primref.cl
@@ -0,0 +1,674 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+#include "instance.h"
+
+#include "bvh_build_primref.h"
+
+//#pragma OPENCL EXTENSION cl_khr_subgroup_non_uniform_vote : enable
+//int sub_group_non_uniform_any(int predicate);
+
+#define WINDOW_SIZE 16
+
+/* Representation of two merged triangles. */
+struct QuadIndices
+{
+ uint primID0, primID1;
+ uint v0, v1, v2, v3;
+};
+
+/*
+
+ This function calculates a PrimRef from a merged quad and writes
+ this PrimRef to memory.
+
+ */
+GRL_INLINE void create_prim_ref(const uint geomID,
+ const struct QuadIndices quad,
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ struct AABB *geometryBounds,
+ struct AABB *centroidBounds,
+ global uint *numPrimitives,
+ global struct AABB *primref)
+{
+
+ /* load quad vertices */
+ const float4 vtx0 = GRL_load_vertex(geomDesc, quad.v0); // FIXME: these multiple load_vertex calls should get merged
+ const float4 vtx1 = GRL_load_vertex(geomDesc, quad.v1);
+ const float4 vtx2 = GRL_load_vertex(geomDesc, quad.v2);
+ const float4 vtx3 = GRL_load_vertex(geomDesc, quad.v3);
+
+ /* calculate bounds for quad */
+ float4 lower = min(min(vtx0, vtx1), min(vtx2, vtx3));
+ float4 upper = max(max(vtx0, vtx1), max(vtx2, vtx3));
+
+ /* extend geometry and centroid bounds */
+ const float4 centroid2 = lower + upper;
+ AABB_extendlu(geometryBounds, lower, upper);
+ AABB_extendlu(centroidBounds, centroid2, centroid2);
+
+ PrimRef ref;
+ PRIMREF_setAABB( &ref, lower.xyz, upper.xyz );
+ PRIMREF_setQuadMetaData( &ref, quad.primID0, quad.primID1, geomID, GRL_get_Flags( geomDesc ) );
+
+ /* store primref to memory */
+ const uint offset = atomic_add_global(numPrimitives, 1);
+ primref[offset] = ref;
+}
+
+/*
+
+ This function calculates a PrimRef from a procedural primitive and writes
+ this PrimRef to memory.
+
+ */
+GRL_INLINE void create_prim_ref_procedural(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc,
+ const uint geomID,
+ const uint primID,
+ struct AABB *geometryBounds,
+ struct AABB *centroidBounds,
+ global uint *numPrimitives,
+ global struct AABB *primref)
+{
+ /* load aabb from memory */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
+
+ /* extend geometry and centroid bounds */
+ float4 lower = (float4)(aabb.MinX, aabb.MinY, aabb.MinZ, 0.0f);
+ float4 upper = (float4)(aabb.MaxX, aabb.MaxY, aabb.MaxZ, 0.0f);
+ const float4 centroid2 = lower + upper;
+ AABB_extendlu(geometryBounds, lower, upper);
+ AABB_extendlu(centroidBounds, centroid2, centroid2);
+
+ /* encode geomID, primID */
+ uint geomFlags = GRL_get_Flags(&geomDesc[geomID]);
+
+ PrimRef ref;
+ PRIMREF_setAABB( &ref, lower.xyz, upper.xyz );
+ PRIMREF_setProceduralMetaData( &ref, geomID, primID, geomFlags );
+
+ /* store primref to memory */
+ const uint offset = atomic_add_global(numPrimitives, 1);
+ primref[offset] = ref;
+}
+
+/*
+
+ This function performs a binary search to calculate the geomID and
+ primID of the i'th primitive of the scene. The search uses a
+ prefix_sum array that stores at each location j the total number of
+ primitives of all meshes k with k < j.
+
+*/
+
+struct GeomPrimID
+{
+ uint geomID, primID;
+};
+
+struct GeomPrimID binary_search_geomID_primID(global uint *prefix_sum, const uint prefix_sum_size, const uint i)
+{
+ uint l = 0;
+ uint r = prefix_sum_size;
+ uint k = 0;
+
+ while (r - l > 1)
+ {
+ const uint m = (l + r) / 2;
+ k = prefix_sum[m];
+ if (k <= i)
+ {
+ l = m;
+ }
+ else if (i < k)
+ {
+ r = m;
+ }
+ }
+
+ struct GeomPrimID id;
+ id.geomID = l;
+ id.primID = i - prefix_sum[l];
+ return id;
+}
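+
+/*
+ Illustrative example (not taken from the build flow): with
+ prefix_sum = {0, 4, 10} and prefix_sum_size = 3, primitive i = 6 falls
+ into mesh 1 because prefix_sum[1] = 4 <= 6 < prefix_sum[2] = 10, so the
+ function returns geomID = 1 and primID = 6 - 4 = 2.
+ */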
+
+/*
+
+ Checks if a vertex contains only finite floating point numbers.
+
+ */
+
+GRL_INLINE bool isfinite_vertex(float4 vtx)
+{
+ return isfinite(vtx.x) && isfinite(vtx.y) && isfinite(vtx.z);
+}
+
+
+/*
+ Create primrefs from array of instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+primrefs_from_DXR_instances(global struct Globals *globals,
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ uint numInstances,
+ global struct AABB *primrefs,
+ uint allowUpdate)
+{
+ const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < numInstances)
+ {
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ 0,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel
+primrefs_from_DXR_instances_indirect(global struct Globals *globals,
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ global struct IndirectBuildRangeInfo* indirect_data,
+ global struct AABB *primrefs,
+ uint allowUpdate)
+{
+ // TODO: On DG2, we have 8 dwords of 'inline data' which can be pushed
+ // directly to the kernel. The rest of the kernel args are pulled using
+ // loads from memory. It may be more efficient to put 'numInstances' and
+ // 'allowUpdate' into 'globals'
+
+ const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+
+ if (instanceIndex < indirect_data->primitiveCount)
+ {
+ instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ 0,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of pointers to instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+primrefs_from_DXR_instances_pointers(global struct Globals *globals,
+ global struct BVHBase* bvh,
+ global void *instances_in,
+ uint numInstances,
+ global struct AABB *primrefs,
+ uint allowUpdate)
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < numInstances)
+ {
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ 0,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of pointers to instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel
+primrefs_from_DXR_instances_pointers_indirect(global struct Globals *globals,
+ global struct BVHBase* bvh,
+ global void *instances_in,
+ global struct AABB *primrefs,
+ global struct IndirectBuildRangeInfo* indirect_data,
+ uint allowUpdate)
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+
+ if (instanceIndex < indirect_data->primitiveCount)
+ {
+ instances = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ 0,
+ allowUpdate);
+ }
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////
+
+bool can_pair( uint3 a, uint3 b )
+{
+ bool match0 = any( a.xxx == b.xyz ) ? 1 : 0;
+ bool match1 = any( a.yyy == b.xyz ) ? 1 : 0;
+ bool match2 = any( a.zzz == b.xyz ) ? 1 : 0;
+ return (match0 + match1 + match2) >= 2;
+}
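+
+/*
+ Example: triangles a = (0,1,2) and b = (2,1,3) share the two vertices
+ 1 and 2, so can_pair() returns true and the two triangles may be merged
+ into one quad; a = (0,1,2) and b = (3,4,5) share no vertex and remain
+ separate primitives.
+ */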
+
+void reduce_bounds(
+ float3 lower,
+ float3 upper,
+ global struct Globals* globals,
+ global struct BVHBase* bvh )
+{
+
+ // reduce centroid bounds... make sure to exclude lanes with invalid AABBs
+ float3 cent = lower + upper;
+ float3 cent_lower = select( (float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper);
+ float3 cent_upper = select(-(float3)(INFINITY, INFINITY, INFINITY), cent, lower <= upper);
+
+ // reduce geo bounds
+ AABB3f_atomic_merge_global_sub_group_lu( &bvh->Meta.bounds, lower, upper );
+ AABB_global_atomic_merge_sub_group_lu(&globals->centroidBounds, cent_lower, cent_upper );
+}
+
+
+struct TriState
+{
+ bool valid;
+ uint prim_index;
+ uint pairing;
+ uint3 indices;
+ float3 lower;
+ float3 upper;
+};
+
+#define NOT_PAIRED 0xffffffff
+
+void load_triangle_data(uniform global char* index_buffer,
+ uniform const uint index_format,
+ uniform global char* vertex_buffer,
+ uniform const uint vertex_format,
+ uniform const uint vertex_stride,
+ uniform global float* transform_buffer,
+ uniform uint total_vert_count,
+ struct TriState* state,
+ float4* v)
+{
+ state->indices = GRL_load_indices_from_buffer(index_buffer, index_format, state->prim_index );
+
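+ /* clamp the indices to the last valid vertex so the loads below cannot
+ read past the end of the vertex buffer; the unclamped indices are still
+ validated against total_vert_count by the caller */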
+ const uint last_vertex = total_vert_count - 1;
+ const uint x = min(state->indices.x, last_vertex);
+ const uint y = min(state->indices.y, last_vertex);
+ const uint z = min(state->indices.z, last_vertex);
+
+ GRL_load_triangle_vertices(vertex_buffer, vertex_format, vertex_stride, transform_buffer, x, y, z, v);
+}
+
+struct TriState load_triangle( uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uniform uint base,
+ uniform uint num_prims,
+ uniform uint total_vert_count )
+{
+
+ struct TriState state;
+ state.pairing = NOT_PAIRED;
+ state.valid = false;
+ state.prim_index = base + get_sub_group_local_id();
+ state.lower = (float3)(INFINITY, INFINITY, INFINITY);
+ state.upper = -(float3)(INFINITY, INFINITY, INFINITY);
+
+ if (state.prim_index < num_prims)
+ {
+ state.valid = true;
+ float4 v[3];
+ load_triangle_data((global char*)geomDesc->Desc.Triangles.pIndexBuffer,
+ geomDesc->Desc.Triangles.IndexFormat,
+ (global char*)geomDesc->Desc.Triangles.pVertexBuffer,
+ geomDesc->Desc.Triangles.VertexFormat,
+ geomDesc->Desc.Triangles.VertexBufferByteStride,
+ (global float*)geomDesc->Desc.Triangles.pTransformBuffer,
+ total_vert_count,
+ &state,
+ v);
+
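+ /* reject the triangle if any index is out of range, any vertex is
+ non-finite, or two indices are equal (degenerate triangle) */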
+ if (state.indices.x >= total_vert_count || state.indices.y >= total_vert_count || state.indices.z >= total_vert_count ||
+ !isfinite_vertex(v[0]) || !isfinite_vertex(v[1]) || !isfinite_vertex(v[2]) ||
+ state.indices.x == state.indices.y || state.indices.x == state.indices.z || state.indices.y == state.indices.z)
+ {
+ state.valid = false;
+ }
+ else
+ {
+ state.lower.xyz = min(v[2].xyz, min(v[1].xyz, v[0].xyz));
+ state.upper.xyz = max(v[2].xyz, max(v[1].xyz, v[0].xyz));
+ }
+ }
+ return state;
+}
+
+void broadcast_triangles_local( struct TriState* state )
+{
+ varying uint my_prim = state->prim_index;
+ varying uint my_pairing = state->pairing;
+ varying float3 my_lower = state->lower;
+ varying float3 my_upper = state->upper;
+ varying bool valid = state->valid;
+ varying uint3 indices = state->indices;
+
+ for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++)
+ {
+ // don't broadcast invalid prims
+ if ( !sub_group_broadcast( valid, broadcast_lane ) )
+ continue;
+
+ uint broadcast_pairing = sub_group_broadcast(my_pairing, broadcast_lane);
+ uint broadcast_prim = sub_group_broadcast(my_prim, broadcast_lane);
+
+ if (broadcast_pairing == NOT_PAIRED)
+ {
+ // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it
+ bool pairable = false;
+ uint3 other_indices = sub_group_broadcast_uint3( indices, broadcast_lane );
+ if (broadcast_prim != my_prim && my_pairing == NOT_PAIRED && valid )
+ {
+ pairable = can_pair( indices, other_indices );
+ }
+
+
+ uint pairable_lane = ctz(intel_sub_group_ballot(pairable));
+ if (valid && pairable_lane < get_sub_group_size())
+ {
+ // pair the broadcast primitive with the first lane that can accept it
+ float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane);
+ float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane);
+ if (get_sub_group_local_id() == pairable_lane)
+ {
+ my_pairing = broadcast_prim;
+ my_lower.xyz = min(my_lower.xyz, broadcast_lower);
+ my_upper.xyz = max(my_upper.xyz, broadcast_upper);
+ }
+
+ // pair the broadcast primitive with the lane that was paired to it
+ uint pairable_prim = sub_group_broadcast(my_pairing, pairable_lane);
+ if (get_sub_group_local_id() == broadcast_lane)
+ {
+ my_pairing = pairable_prim;
+ }
+ }
+ }
+ else
+ {
+ //
+ // if this lane was already paired with the broadcasting tri
+ // in an earlier loop iteration, then record the pairing in this lane's registers
+ float3 broadcast_lower = sub_group_broadcast_float3(my_lower.xyz, broadcast_lane);
+ float3 broadcast_upper = sub_group_broadcast_float3(my_upper.xyz, broadcast_lane);
+ if (broadcast_pairing == my_prim)
+ {
+ my_pairing = broadcast_prim;
+ my_lower.xyz = min(my_lower.xyz, broadcast_lower);
+ my_upper.xyz = max(my_upper.xyz, broadcast_upper);
+ }
+ }
+ }
+
+ state->pairing = my_pairing;
+ state->lower = my_lower;
+ state->upper = my_upper;
+}
+
+
+void broadcast_triangles_nonlocal(struct TriState* state, const struct TriState* other )
+{
+ varying uint my_prim = state->prim_index;
+ varying uint my_pairing = state->pairing;
+ varying float3 my_lower = state->lower;
+ varying float3 my_upper = state->upper;
+ varying bool valid = state->valid;
+ varying uint3 indices = state->indices;
+
+ for (uniform uint broadcast_lane = 0; broadcast_lane < get_sub_group_size(); broadcast_lane++)
+ {
+ // don't broadcast invalid prims
+ if (!sub_group_broadcast(other->valid, broadcast_lane))
+ continue;
+
+ uint broadcast_pairing = sub_group_broadcast(other->pairing, broadcast_lane);
+ uint broadcast_prim = sub_group_broadcast(other->prim_index, broadcast_lane);
+
+ if (broadcast_pairing == NOT_PAIRED)
+ {
+ // if the broadcast prim is not paired already, all unpaired lanes attempt to pair with it
+ bool pairable = false;
+ if ( my_pairing == NOT_PAIRED && valid )
+ {
+ uint3 other_indices = sub_group_broadcast_uint3(other->indices, broadcast_lane);
+ pairable = can_pair(indices, other_indices);
+ }
+
+ // pair the broadcast primitive with the first lane that can accept it
+ uint pairable_mask = intel_sub_group_ballot(pairable);
+ if (valid && (ctz(pairable_mask) == get_sub_group_local_id()))
+ {
+ my_pairing = broadcast_prim;
+ my_lower.xyz = min(my_lower.xyz, sub_group_broadcast_float3(other->lower.xyz, broadcast_lane));
+ my_upper.xyz = max(my_upper.xyz, sub_group_broadcast_float3(other->upper.xyz, broadcast_lane));
+ }
+ }
+
+ }
+
+ state->pairing = my_pairing;
+ state->lower = my_lower;
+ state->upper = my_upper;
+}
+
+GRL_INLINE void do_triangles_to_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uint geomID_and_flags,
+ const uint num_prims)
+{
+ uint geomID = geomID_and_flags & 0x00ffffff;
+ uint geom_flags = geomID_and_flags >> 24;
+ uint prim_base = get_group_id(0) * get_local_size(0);
+ uint total_vert_count = GRL_get_triangles_VertexCount(geomDesc);
+
+ struct TriState tri = load_triangle( geomDesc, prim_base, num_prims, total_vert_count );
+ broadcast_triangles_local( &tri );
+
+
+ // we will produce output if the lane holds an unpaired triangle (tri.pairing == NOT_PAIRED)
+ // or, for a merged pair, on the lane whose partner has the larger prim_index
+ bool will_write = (tri.pairing > tri.prim_index) && tri.valid;
+ uint write_mask = intel_sub_group_ballot(will_write);
+ uint write_offs = subgroup_bit_prefix_exclusive( write_mask );
+ uint write_count = popcount(write_mask);
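+
+ // Worked example of the compaction above: if will_write is set on lanes
+ // {0, 2, 3} of the subgroup, write_mask has bits 0, 2 and 3 set,
+ // write_offs evaluates to 0, 1 and 2 on those lanes, write_count is 3,
+ // and the single atomic below reserves three consecutive primref slots.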
+
+ // allocate space in primref buffer
+ uint write_base;
+ if( get_sub_group_local_id() == 0 )
+ write_base = atomic_add_global( &globals->numPrimitives, write_count );
+ write_offs += sub_group_broadcast( write_base, 0 );
+
+ uint primID0 = tri.prim_index;
+ uint primID1 = (tri.pairing != NOT_PAIRED) ? tri.pairing : tri.prim_index;
+
+ if (will_write)
+ {
+ PrimRef ref;
+ PRIMREF_setAABB(&ref, tri.lower.xyz, tri.upper.xyz);
+ PRIMREF_setQuadMetaData(&ref, primID0, primID1, geomID, geom_flags);
+ uint8 val = (uint8)(
+ as_uint(ref.lower.x), as_uint(ref.lower.y), as_uint(ref.lower.z), as_uint(ref.lower.w),
+ as_uint(ref.upper.x), as_uint(ref.upper.y), as_uint(ref.upper.z), as_uint(ref.upper.w));
+ store_uint8_L1WB_L3WB((global uint8*)(primref + write_offs), 0, val);
+ }
+
+ reduce_bounds( tri.lower, tri.upper, globals, bvh );
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+triangles_to_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uint geomID_and_flags,
+ uint num_prims
+ )
+{
+ do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+triangles_to_primrefs_indirect(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct IndirectBuildRangeInfo* indirect_data,
+ uint geomID_and_flags)
+{
+ const uint num_prims = indirect_data->primitiveCount;
+ do_triangles_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
+}
+
+GRL_INLINE void do_procedurals_to_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uint geomID_and_flags,
+ const uint num_prims)
+{
+ uint geomID = geomID_and_flags & 0x00ffffff;
+ uint geomFlags = geomID_and_flags >> 24;
+
+ uint primID = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();
+
+ bool create_primref = false;
+ float3 lower = (float3)(INFINITY, INFINITY, INFINITY);
+ float3 upper = -(float3)(INFINITY, INFINITY, INFINITY);
+ if (primID < num_prims)
+ {
+ /* check if procedural is valid */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(geomDesc, primID);
+ const bool valid_min = isfinite(aabb.MinX) && isfinite(aabb.MinY) && isfinite(aabb.MinZ);
+ const bool valid_max = isfinite(aabb.MaxX) && isfinite(aabb.MaxY) && isfinite(aabb.MaxZ);
+ if (valid_min & valid_max)
+ {
+ /* load aabb from memory */
+ float3 l = (float3)(aabb.MinX, aabb.MinY, aabb.MinZ);
+ float3 u = (float3)(aabb.MaxX, aabb.MaxY, aabb.MaxZ);
+
+ // canonicalize boxes with swapped min/max by taking the component-wise min/max
+ lower = min( l, u );
+ upper = max( l, u );
+
+ create_primref = true;
+ }
+ }
+
+ uint write_mask = intel_sub_group_ballot(create_primref);
+ uint write_offs = subgroup_bit_prefix_exclusive(write_mask);
+ uint write_count = popcount(write_mask);
+
+ // allocate space in primref buffer
+ uint write_base;
+ if (get_sub_group_local_id() == 0)
+ write_base = atomic_add_global(&globals->numPrimitives, write_count);
+ write_offs += sub_group_broadcast(write_base, 0);
+
+ // write the primref
+ if (create_primref)
+ {
+ PrimRef ref;
+ PRIMREF_setAABB(&ref, lower.xyz, upper.xyz);
+ PRIMREF_setProceduralMetaData(&ref, geomID, primID, geomFlags);
+ primref[write_offs] = ref;
+ }
+
+ reduce_bounds(lower, upper, globals, bvh);
+
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+procedurals_to_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uint geomID_and_flags,
+ uint num_prims
+ )
+{
+ do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+procedurals_to_primrefs_indirect(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global const struct IndirectBuildRangeInfo* indirect_data,
+ uint geomID_and_flags
+ )
+{
+ const uint num_prims = indirect_data->primitiveCount;
+ do_procedurals_to_primrefs(globals, bvh, primref, geomDesc, geomID_and_flags, num_prims);
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_primref.h b/src/intel/vulkan/grl/gpu/bvh_build_primref.h
new file mode 100644
index 00000000000..25e2d3df194
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_primref.h
@@ -0,0 +1,246 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#if 0
+/*
+
+Create primrefs from array of instance descriptors.
+
+*/
+
+void store_instance_primref(
+ global struct BVHBase* top_bvh,
+ global struct Globals* globals,
+ global PrimRef* primrefs,
+ bool alloc_primref,
+ PrimRef new_primref )
+{
+ uint allocatePrimref = alloc_primref ? 1 : 0;
+ uint index = 0;
+ uint numAllocations = sub_group_reduce_add(allocatePrimref);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ index = atomic_add_global(&globals->numPrimitives, numAllocations);
+ }
+
+ index = sub_group_broadcast(index, 0);
+ index = index + sub_group_scan_exclusive_add(allocatePrimref);
+
+ if (allocatePrimref)
+ {
+ primrefs[index] = new_primref;
+ }
+
+ struct AABB centroidBounds;
+ centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref);
+ struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
+ struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(&centroidBounds);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz);
+ AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds);
+ }
+}
+
+
+
+// Compute transformed blas AABB. Returns false if instance is degenerate
+bool create_instance_primref(
+ PrimRef* ref_out,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
+ global struct BVHBase* bvh,
+ uint instanceMask,
+ uint instanceIndex
+ )
+{
+ struct AABB3f bbox;
+ bool alloc_primref = false;
+ uint rootNodeOffset = NO_NODE_OFFSET;
+ if (bvh != 0)
+ {
+ alloc_primref = true;
+ AABB3f AS_bounds = BVHBase_GetRootAABB(bvh);
+
+ const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]);
+ const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]);
+
+ if (!valid_min || !valid_max || instanceMask == 0)
+ {
+ // degenerate instance case
+
+ // TODO this should be under if ( allocate backpointers )
+ {
+ // we have to allocate the primref because this instance can be updated to non-degenerate
+ // take the origin of the instance as a bounding box.
+
+ bbox.lower[0] = instance->Transform[3];
+ bbox.lower[1] = instance->Transform[7];
+ bbox.lower[2] = instance->Transform[11];
+ bbox.upper[0] = instance->Transform[3];
+ bbox.upper[1] = instance->Transform[7];
+ bbox.upper[2] = instance->Transform[11];
+ instanceMask = 0;
+ }
+ }
+ else
+ {
+ rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ float transformOverhead = 0.0f;
+ bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead);
+ }
+ }
+
+ *ref_out = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, 0);
+ return alloc_primref;
+}
+
+GRL_INLINE void primrefs_from_instances(
+ global struct Globals* globals,
+ global struct BVHBase* top_bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
+ uint instanceIndex,
+ global struct AABB* primrefs)
+{
+ bool alloc_primref = false;
+ PrimRef new_primref;
+ AABB_init(&new_primref);
+
+ if (instance)
+ {
+ uint mask = GRL_get_InstanceMask(instance);
+ global struct BVHBase* bvh = (global struct BVHBase*)instance->AccelerationStructure;
+ alloc_primref = create_instance_primref(&new_primref, instance, bvh, mask, instanceIndex);
+ }
+
+ store_instance_primref(top_bvh, globals, primrefs, alloc_primref, new_primref);
+}
+#endif
+
+#if 1
+GRL_INLINE void primrefs_from_instances(
+ global struct Globals* globals,
+ global struct BVHBase* top_bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
+ uint instanceIndex,
+ global struct AABB* primrefs,
+ global GRL_RAYTRACING_AABB* procedural_aabb,
+ uint allowUpdate
+ )
+{
+ struct AABB3f bbox;
+ uint allocatePrimref = 0;
+
+ uint rootNodeOffset = NO_NODE_OFFSET;
+ uint instanceMask = 0;
+
+ bool is_procedural = (procedural_aabb != 0);
+
+ if( instance )
+ {
+ instanceMask = GRL_get_InstanceMask(instance) ;
+ if ( is_procedural )
+ {
+ // procedural instance primref
+ allocatePrimref = 1;
+
+ float3 lower = (float3)(procedural_aabb->MinX, procedural_aabb->MinY, procedural_aabb->MinZ);
+ float3 upper = (float3)(procedural_aabb->MaxX, procedural_aabb->MaxY, procedural_aabb->MaxZ);
+
+ if (instanceMask == 0 || any(lower > upper))
+ {
+ bbox.lower[0] = instance->Transform[3];
+ bbox.lower[1] = instance->Transform[7];
+ bbox.lower[2] = instance->Transform[11];
+ bbox.upper[0] = instance->Transform[3];
+ bbox.upper[1] = instance->Transform[7];
+ bbox.upper[2] = instance->Transform[11];
+ instanceMask = 0;
+ }
+ else
+ {
+ bbox = transform_aabb(lower, upper, instance->Transform);
+ }
+ }
+ else
+ {
+ // HW-instance primref
+
+ global struct BVHBase* bvh = instance ?
+ (global struct BVHBase*)instance->AccelerationStructure :
+ 0;
+
+ if (bvh != 0)
+ {
+ AABB3f AS_bounds = BVHBase_GetRootAABB(bvh);
+
+ const bool valid_min = isfinite(AS_bounds.lower[0]) && isfinite(AS_bounds.lower[1]) && isfinite(AS_bounds.lower[2]);
+ const bool valid_max = isfinite(AS_bounds.upper[0]) && isfinite(AS_bounds.upper[1]) && isfinite(AS_bounds.upper[2]);
+
+
+ if (valid_min && valid_max && instanceMask != 0)
+ {
+ allocatePrimref = 1;
+ rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ float transformOverhead = 0.0f;
+ bbox = compute_xfm_bbox(instance->Transform, BVHBase_GetRootNode(bvh), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &AS_bounds, transformOverhead);
+ }
+ else if (allowUpdate)
+ {
+ // degenerate instance case
+ // we have to allocate the primref because this instance can be updated to non-degenerate
+ // take the origin of the instance as a bounding box.
+ allocatePrimref = 1;
+ bbox.lower[0] = instance->Transform[3];
+ bbox.lower[1] = instance->Transform[7];
+ bbox.lower[2] = instance->Transform[11];
+ bbox.upper[0] = instance->Transform[3];
+ bbox.upper[1] = instance->Transform[7];
+ bbox.upper[2] = instance->Transform[11];
+ instanceMask = 0;
+ }
+ }
+ }
+ }
+
+ uint index = 0;
+ uint numAllocations = sub_group_reduce_add(allocatePrimref);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ index = atomic_add_global(&globals->numPrimitives, numAllocations);
+ }
+
+ index = sub_group_broadcast(index, 0);
+ index = index + sub_group_scan_exclusive_add(allocatePrimref);
+
+ struct AABB new_primref;
+ struct AABB centroidBounds;
+ if (allocatePrimref)
+ {
+ new_primref = PRIMREF_set_instance(AABB3f_load_lower(&bbox), AABB3f_load_upper(&bbox), instanceIndex, instanceMask, rootNodeOffset, is_procedural);
+ primrefs[index] = new_primref;
+ centroidBounds.lower = centroidBounds.upper = AABB_centroid2(&new_primref);
+ }
+ else
+ {
+ AABB_init(&new_primref);
+ AABB_init(&centroidBounds);
+ }
+
+
+ struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
+ struct AABB subgroup_CentroidBounds = AABB_sub_group_reduce(&centroidBounds);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ AABB3f_atomic_merge_global_lu(&top_bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz);
+ AABB_global_atomic_merge(&globals->centroidBounds, &subgroup_CentroidBounds);
+ }
+}
+#endif
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.cl b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl
new file mode 100644
index 00000000000..bcda2fa54ec
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_refit.cl
@@ -0,0 +1,491 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "bvh_build_refit.h"
+#include "api_interface.h"
+#include "common.h"
+
+
+
+
+
+#if 0
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 16, 1, 1 )) )
+void kernel
+update_instance_leaves( global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch
+)
+{
+ uint num_leaves = BVHBase_GetNumHWInstanceLeaves( bvh );
+ uint id = get_local_id( 0 ) + get_local_size( 0 ) * get_group_id( 0 );
+ if ( id >= num_leaves )
+ return;
+
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
+ (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
+ global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
+ (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;
+
+ global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
+
+ /* iterate over all children of the instance node and get their bounds */
+
+ uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex( &leafs[id] );
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
+ if ( dxrInstancesArray != NULL )
+ instance = &instancesArray[instanceIdx];
+ else
+ instance = instancesPtrArray[instanceIdx];
+
+ struct AffineSpace3f xfm = AffineSpace3f_load_row_major( instance->Transform );
+ global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
+ struct AABB3f newSubtreeBounds = instanceBvh->Meta.bounds;
+ struct AABB3f bbox = AABB3f_transform( xfm, newSubtreeBounds ); // JDB TODO: Use faster abs-matrix method
+
+ const bool valid_min = isfinite( bbox.lower[0] ) && isfinite( bbox.lower[1] ) && isfinite( bbox.lower[2] );
+ const bool valid_max = isfinite( bbox.upper[0] ) && isfinite( bbox.upper[1] ) && isfinite( bbox.upper[2] );
+
+ uint mask = GRL_get_InstanceMask(instance);
+
+ uint offset = instanceBvh->rootNodeOffset;
+ if ( !valid_min || !valid_max )
+ {
+ bbox.lower[0] = xfm.p.x;
+ bbox.lower[1] = xfm.p.y;
+ bbox.lower[2] = xfm.p.z;
+ bbox.upper[0] = xfm.p.x;
+ bbox.upper[1] = xfm.p.y;
+ bbox.upper[2] = xfm.p.z;
+ offset = NO_NODE_OFFSET;
+ mask = 0;
+ }
+
+ instance_aabb_scratch[id] = bbox;
+
+ HwInstanceLeaf_Constructor( &leafs[id], instance, instanceIdx, offset, mask ); // TODO: No instance opening for refittable BVH
+}
+#endif
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+update_instance_leaves(global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch
+)
+{
+ uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
+ uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
+ if (id >= num_leaves)
+ return;
+
+ DO_update_instance_leaves(
+ bvh,
+ dxrInstancesArray,
+ dxrInstancesPtr,
+ instance_aabb_scratch,
+ id,
+ 0 );
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+update_instance_leaves_indirect(global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch,
+ global struct IndirectBuildRangeInfo* indirect_data)
+{
+ uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
+ uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
+ if (id >= num_leaves)
+ return;
+
+ DO_update_instance_leaves(
+ bvh,
+ dxrInstancesArray + indirect_data->primitiveOffset,
+ dxrInstancesPtr,
+ instance_aabb_scratch,
+ id,
+ 0 );
+}
+
+#if 0
+/*
+
+ This kernel refits a BVH. The algorithm iterates over all BVH nodes
+ to find the leaf nodes, which is where refitting starts. For these
+ leaf nodes the bounds get recalculated and then propagated up the tree.
+
+ One kernel instance considers a range of inner nodes as startpoints.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(8, 1, 1))) void kernel refit(
+ global struct BVHBase *bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
+ global struct AABB3f* instance_leaf_aabbs )
+{
+ /* here we temporarily store the bounds for the children of a node */
+ struct AABB childrenAABB[BVH_NODE_N6];
+
+ /* get pointer to inner nodes and back pointers */
+ global struct QBVHNodeN *inner_nodes = BVHBase_rootNode(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* construct range of nodes that each work group will process */
+ const uint numInnerNodes = BVHBase_numNodes(bvh);
+ const uint startID = (get_group_id(0) + 0) * numInnerNodes / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numInnerNodes / get_num_groups(0);
+
+ /* each workgroup iterates over its range of nodes */
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ global struct QBVHNodeN* curNode = &inner_nodes[i];
+ uint numChildren = refit_bottom(bvh, geosArray,
+ instance_leaf_aabbs,
+ curNode,
+ childrenAABB,
+ *InnerNode_GetBackPointer(backPointers, i));
+ if (numChildren != 0)
+ {
+ /* update bounds of node */
+ QBVHNodeN_setBounds(curNode, childrenAABB, numChildren);
+
+ /* refit upper parts of the BVH */
+ // TODO: this will not work for mixed nodes
+ refit_bottom_up(curNode, bvh, childrenAABB, numChildren);
+ }
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(8, 1, 1)))
+void kernel Find_refit_treelets(
+ global struct BVHBase* bvh,
+ global TreeletNodeData* treelets,
+ global uint* scratchStartpoints,
+ global uint* startpointAlloc)
+{
+ find_refit_treelets(bvh,
+ treelets,
+ scratchStartpoints,
+ startpointAlloc);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel Assign_refit_startpoints_to_treelets(
+ global struct BVHBase* bvh,
+ global TreeletNodeData* treelets,
+ global uint* scratchStartpoints)
+{
+ assign_refit_startpoints_to_treelets(bvh, treelets, scratchStartpoints);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(128, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel Finalize_treelets_in_groups(
+ global struct BVHBase* bvh,
+ global uint* scratchStartpoints )
+{
+ local uint depths[FINALIZE_TREELETS_SLM_DEPTHS_SPACE];
+
+ finalize_treelets_in_groups(bvh, scratchStartpoints, depths);
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel Refit_quads_tree_per_group(global SquashedInput* psqinputs)
+{
+ uint group_id = get_group_id(0);
+ SquashedInput sqinput = psqinputs[group_id];
+ global struct BVHBase* bvh = sqinput.pBvh;
+ uint numLeaves = BVHBase_GetNumQuads(bvh);
+ global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
+
+ global void* input = sqinput.pInput;
+ global struct AABB* bbox_scratch = sqinput.bbox_scratch;
+
+ uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
+ uint id = get_local_id(0);
+
+ for (uint leaf_id = id; leaf_id < numLeaves; leaf_id += get_local_size(0))
+ {
+ struct AABB theAABB;
+ refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB);
+ theAABB.lower.w = as_float(0xABBADEFFu);
+ bbox_scratch[leafsIndexOffset + leaf_id] = theAABB;
+ }
+}
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(32, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel Refit_quads(
+ global struct BVHBase* bvh,
+ global void* input,
+ global struct AABB* bbox_scratch,
+ uint numGroupsExecuted,
+ global SquashedInputGroupDesc* sqinput)
+{
+ uint numLeafs = BVHBase_GetNumQuads(bvh);
+ if (numLeafs == 0) return;
+ global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
+ uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
+
+ uint numLeafsPerGr = (numLeafs + (numGroupsExecuted - 1)) / numGroupsExecuted;
+
+ uint id_start = get_group_id(0) * numLeafsPerGr + get_local_id(0);
+ uint id_end = min(id_start + numLeafsPerGr, numLeafs);
+ for (uint id = id_start; id < id_end; id+= get_local_size(0))
+ {
+ struct AABB theAABB;
+ refit_bottom_child_quad(leafs + id, geosArray, &theAABB);
+ theAABB.lower.w = as_float(0xABBADEFFu);
+ bbox_scratch[leafsIndexOffset + id] = theAABB;
+ }
+
+ if (get_group_id(0) == 0 && get_local_id(0) < 16)
+ {
+
+ uint groupnr;
+ uint treeletCnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
+ if (get_sub_group_local_id() == 0) {
+ groupnr = atomic_add_global(&sqinput->totalNumGroups, treeletCnt);
+ }
+ groupnr = sub_group_broadcast(groupnr, 0);
+ for (uint subtree = get_sub_group_local_id(); subtree < treeletCnt; subtree += get_sub_group_size())
+ {
+ uint gr = groupnr + subtree;
+ //printf("tree %llx, treelet %d/%d, grId %d, numStartpoints %d\n", bvh, subtree,treeletCnt, gr, BVHBase_GetRefitTreeletDescs(bvh)[subtree].numStartpoints);
+ sqinput[gr].bvh = (qword)bvh;
+ sqinput[gr].scratch = (qword)bbox_scratch;
+ sqinput[gr].groupInTree = subtree;
+ }
+ //if (get_local_id(0)==0 && treeletCnt > 1)
+ //{
+ // printf("tree %llx, tip treelet %d/%d = numStartpoints %d depth %d\n", bvh, treeletCnt, treeletCnt, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].numStartpoints, BVHBase_GetRefitTreeletDescs(bvh)[treeletCnt].maxDepth);
+ //}
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+Refit_tree_per_group_quad(
+ global SquashedInput* psqinputs)
+{
+ uint group_id = get_group_id(0);
+ SquashedInput sqinput = psqinputs[group_id];
+ global struct BVHBase* bvh = sqinput.pBvh;
+ global struct AABB* bbox_scratch = sqinput.bbox_scratch;
+ global void* pInput = sqinput.pInput;
+ local Treelet_by_single_group_locals loc;
+
+ if (*BVHBase_GetRefitTreeletCntPtr(bvh) == 0)
+ return;
+
+#if REFIT_DEBUG_CHECKS
+ uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
+ if (bottoms_cnt != 1) {
+ if (get_local_id(0) == 0)
+ {
+ printf("Error: this tree has more than 1 treelets!\n");
+ }
+ return;
+ }
+#endif
+
+ /* get pointer to inner nodes and back pointers */
+ uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
+
+ // uniform per group
+ uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh);
+
+ uint numLeafs = bvh->quadLeafCur - bvh->quadLeafStart;
+
+ if (numLeafs == 0) { return; }
+
+ uint numLeafsByOneThread = (numLeafs + (get_local_size(0) - 1)) / get_local_size(0);
+
+ update_quads(bvh, pInput, bbox_scratch, get_local_id(0), numLeafsByOneThread);
+
+ mem_fence_workgroup_default(); work_group_barrier(0);
+
+ RefitTreelet trltDsc = *pTrltDsc;
+
+ refit_treelet_by_single_group(
+ bbox_scratch,
+ &loc,
+ bvh,
+ trltDsc,
+ false,
+ true);
+
+ if (trltDsc.maxDepth > 0)
+ {
+ mem_fence_workgroup_default(); work_group_barrier(0);
+ post_refit_encode_qnode_tree_per_group(bbox_scratch,bvh);
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel
+Refit_treelet_per_group(
+ global SquashedInputGroupDesc* sqinput)
+{
+ uint group_id = get_group_id(0);
+ global struct AABB* bbox_scratch = (global struct AABB* )sqinput[group_id].scratch;
+ global struct BVHBase* bvh = (global struct BVHBase* )sqinput[group_id].bvh;
+ group_id = sqinput[group_id].groupInTree;
+
+ /* get pointer to inner nodes and back pointers */
+ uniform global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
+
+ uint bottoms_cnt = *BVHBase_GetRefitTreeletCntPtr(bvh);
+
+ // uniform per group
+ uniform RefitTreelet* pTrltDsc = BVHBase_GetRefitTreeletDescs(bvh);
+
+ bool should_we_process_treetip = true;
+ local Treelet_by_single_group_locals loc;
+ local bool* l_should_we_process_treetip = (local bool*)&loc;
+#if REFIT_VERBOSE_LOG
+ if (group_id != 0) return;
+#endif
+
+ if (bottoms_cnt > 1)
+ {
+#if REFIT_VERBOSE_LOG
+ for (; group_id < bottoms_cnt; group_id++)
+ {
+ if (get_local_id(0) == 0) { printf("\n ====== treelet %d ====== \n", group_id); }
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, memory_scope_device);
+#endif
+ bool rootProcThread = refit_treelet_by_single_group(
+ bbox_scratch,
+ &loc,
+ bvh,
+ pTrltDsc[group_id],
+ true,
+ false);
+
+ // the last group that finishes has to go up and process the treetip
+ if (rootProcThread)
+ {
+
+ mem_fence_gpu_invalidate();
+ uint finished_cnt = atomic_inc_global((global uint*) & bvh->refitTreeletCnt2);
+ should_we_process_treetip = finished_cnt + 1 == bottoms_cnt;
+
+ * l_should_we_process_treetip = should_we_process_treetip;
+
+ if (should_we_process_treetip) mem_fence_gpu_invalidate();
+ }
+#if REFIT_VERBOSE_LOG
+ }
+#endif
+ work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
+
+ should_we_process_treetip = *l_should_we_process_treetip;
+ }
+
+ if (should_we_process_treetip)
+ {
+ //this group will process treetip
+ if (get_local_id(0) == 0) { bvh->refitTreeletCnt2 = 0; }
+ if (bottoms_cnt == 1) { bottoms_cnt = 0; }
+ refit_treelet_by_single_group(
+ bbox_scratch,
+ &loc,
+ bvh,
+ pTrltDsc[bottoms_cnt],
+ true,
+ true);
+ }
+}
+
+/*
+ This kernel refits a BVH. The algorithm iterates over all BVH nodes
+ to find the leaf nodes, which is where refitting starts. For these
+ leaf nodes the bounds get recalculated and then propagated up the tree.
+
+ One kernel instance considers exactly one inner node as a startpoint,
+ not a range of inner nodes.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(8, 1, 1))) void kernel
+Refit_per_one_startpoint(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
+ global struct AABB3f* instance_leaf_aabbs )
+{
+ /* here we temporarily store the bounds for the children of a node */
+ struct AABB childrenAABB[BVH_NODE_N6];
+
+ /* get pointer to inner nodes and back pointers */
+ global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* get the inner node that we will consider as a bottom startpoint */
+ const uint numInnerNodes = BVHBase_numNodes(bvh);
+ const uint innerNodeIdx = (get_group_id(0) + 0) * get_local_size(0) + get_local_id(0);
+
+ if (innerNodeIdx >= numInnerNodes) return;
+
+ global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
+ uint numChildren = refit_bottom(
+ bvh,
+ geosArray,
+ instance_leaf_aabbs,
+ curNode,
+ childrenAABB,
+ *InnerNode_GetBackPointer(backPointers, innerNodeIdx));
+
+ if (numChildren != 0)
+ {
+ /* update bounds of node */
+ QBVHNodeN_setBounds(curNode, childrenAABB, numChildren);
+
+ /* refit upper parts of the BVH */
+ /* TODO: this will not work for mixed nodes */
+ refit_bottom_up(curNode, bvh, childrenAABB, numChildren);
+ }
+}
+
+#endif
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel
+Refit_indirect_sg(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
+ global struct AABB3f* instance_leaf_aabbs)
+{
+ DO_Refit_per_one_startpoint_sg(bvh, geosArray, instance_leaf_aabbs, 0);
+
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_refit.h
new file mode 100644
index 00000000000..522a44b23a7
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_refit.h
@@ -0,0 +1,546 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "common.h"
+#include "api_interface.h"
+#include "instance.h"
+#include "GRLGen12.h"
+#include "libs/lsc_intrinsics.h"
+
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+DO_update_instance_leaves(global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch,
+ uint id ,
+ global struct GRL_RAYTRACING_AABB* procedural_box
+)
+{
+
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
+ (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
+ global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
+ (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;
+
+ global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);
+
+
+ /* iterate over all children of the instance node and get their bounds */
+
+ uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]);
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
+ if (dxrInstancesArray != NULL)
+ instance = &instancesArray[instanceIdx];
+ else
+ instance = instancesPtrArray[instanceIdx];
+
+ uint mask = GRL_get_InstanceMask(instance);
+ uint offset = NO_NODE_OFFSET;
+
+ struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform);
+ struct AABB3f bbox;
+
+ if (procedural_box != 0)
+ {
+ bbox.lower[0] = procedural_box->MinX;
+ bbox.lower[1] = procedural_box->MinY;
+ bbox.lower[2] = procedural_box->MinZ;
+ bbox.upper[0] = procedural_box->MaxX;
+ bbox.upper[1] = procedural_box->MaxY;
+ bbox.upper[2] = procedural_box->MaxZ;
+ }
+ else
+ {
+ global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
+ bbox = instanceBvh->Meta.bounds;
+ offset = BVH_ROOT_NODE_OFFSET;
+ }
+
+
+ const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]);
+ const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]);
+
+ if (!valid_min || !valid_max )
+ {
+ bbox.lower[0] = xfm.p.x;
+ bbox.lower[1] = xfm.p.y;
+ bbox.lower[2] = xfm.p.z;
+ bbox.upper[0] = xfm.p.x;
+ bbox.upper[1] = xfm.p.y;
+ bbox.upper[2] = xfm.p.z;
+ offset = NO_NODE_OFFSET;
+ mask = 0;
+ }
+ else
+ {
+ bbox = AABB3f_transform(xfm, bbox); // JDB TODO: Use faster abs-matrix method
+ }
+
+ instance_aabb_scratch[id] = bbox;
+
+ HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH
+}
+
+/*
+ This function starts at some BVH node and refits all nodes upwards
+ to the root. At each node the algorithm only proceeds upwards once
+ all children of the current node have been processed. This is
+ tracked with a per-node atomic counter that is incremented every
+ time the node is reached; traversal continues upwards only once the
+ counter reaches the number of children of the node.
+ */
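+
+/*
+ Back-pointer encoding assumed by the code below (as read from the bit
+ operations in this file): bits [2:0] count the already-refitted children,
+ bits [5:3] store the total number of children, and bits [31:6] store the
+ index of the parent node, with 0x03FFFFFF marking the root.
+ */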
+
+GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed)
+ global struct BVHBase *bvh, // pointer to BVH
+ struct AABB *childrenAABB, // temporary data to use
+ uint numChildrenTotal)
+{
+ global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* compute the index of the start node */
+ uint curNodeIndex = qnode_start - nodeData;
+
+ /* the start node was already processed, so go to its parent node */
+ curNodeIndex = *InnerNode_GetBackPointer(backPointers,curNodeIndex) >> 6;
+
+ /* end at root node */
+ while (curNodeIndex != 0x03FFFFFF)
+ {
+ /* increment refit counter that counts refitted children of current node */
+ const uint parentPointer = 1 + atomic_inc_global( (__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex));
+
+ /* if all children got refitted, then continue */
+ const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
+ numChildrenTotal = (parentPointer >> 3) & 0x7;
+ if (numChildrenRefitted != numChildrenTotal)
+ return;
+
+ /* reset refit counter for next refit */
+ *InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8;
+
+ /* get bounds of all children from child nodes directly */
+ global struct QBVHNodeN *qnode = nodeData + curNodeIndex;
+ global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode);
+ for (uint k = 0; k < numChildrenTotal; k++)
+ childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k);
+
+ /* update node bounds of all children */
+ QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal);
+
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+
+ /* make parent node the current node */
+ curNodeIndex = parentPointer >> 6;
+ }
+
+ /* update QBVH6 bounds */
+ struct AABB bounds;
+ AABB_init(&bounds);
+
+ for (uint i = 0; i < numChildrenTotal; i++)
+ AABB_extend(&bounds, &childrenAABB[i]);
+
+ setBVHBaseBounds(bvh, &bounds);
+}
+
+
+GRL_INLINE void SUBGROUP_refit_bottom_up(
+ uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed)
+ uniform global struct BVHBase* bvh, // pointer to BVH
+ varying struct AABB reduce_bounds,
+ uniform uint numChildrenTotal,
+ varying ushort lane,
+ varying ushort head_lane)
+{
+ uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+ uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+
+ /* compute the index of the start node */
+ uniform uint curNodeIndex = qnode_start - nodeData;
+
+ /* the start node was already processed, so go to its parent node */
+ uniform curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6;
+
+ varying struct AABB childrenAABB;
+
+ /* end at root node */
+ while ( curNodeIndex != 0x03FFFFFF )
+ {
+ mem_fence_gpu_invalidate();
+
+ /* increment refit counter that counts refitted children of current node */
+ uniform uint parentPointer = 1;
+ if (lane == 0)
+ {
+ // acquire fence ensures that all previous writes complete before the atomic starts
+ parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex));
+ }
+
+ parentPointer = intel_sub_group_shuffle( parentPointer, head_lane );
+
+ /* if all children got refitted, then continue */
+ uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
+ numChildrenTotal = (parentPointer >> 3) & 0x7;
+ if ( numChildrenRefitted != numChildrenTotal )
+ return;
+
+ /* reset refit counter for next refit */
+ if (lane == 0)
+ {
+ *InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8);
+ }
+
+ /* get bounds of all children from child nodes directly */
+ global struct QBVHNodeN* qnode = nodeData + curNodeIndex;
+ global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
+
+ varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0;
+ childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+ /* update node bounds of all children */
+ reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
+ reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane );
+
+ subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane);
+
+ /* update node mask */
+ uchar childrenMask = qnode_child[child_idx].instMask;
+
+ qnode->instMask = sub_group_reduce_or_N6(childrenMask);
+
+ /* make parent node the current node */
+ curNodeIndex = parentPointer >> 6;
+ }
+
+ /* update QBVH6 bounds */
+
+ if( lane == 0 )
+ setBVHBaseBounds( bvh, &reduce_bounds );
+}
+
+
+GRL_INLINE void quadCopyVertices(
+ const struct QuadLeaf* pQuad,
+ struct QuadLeaf* newQuad)
+{
+ const uint4* s = (const uint4*) & (pQuad->v[0][0]);
+ uint4* d = (uint4*) & (newQuad->v[0][0]);
+ const uint8* s2 = (const uint8*)(s+1);
+ uint8* d2 = (uint8*)(d+1);
+ *d = *s;
+ *d2 = *s2;
+}
+
+
+GRL_INLINE void get_updated_quad(
+ global const struct QuadLeaf* pQuad,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs,
+ struct QuadLeaf* newQuad)
+{
+ struct QuadLeaf tempQuad;
+
+ // fetch non vtx data;
+ {
+ uint4* tempQuad4U = (uint4*)&tempQuad;
+ global const uint4* pQuad4U = (global const uint4*)pQuad;
+ *tempQuad4U = *pQuad4U;
+ }
+
+ /* get the geomID and primID0/1 for both quad triangles */
+ const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc);
+ const uint primID0 = tempQuad.primIndex0;
+ const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad);
+ ushort fourth_vert = 0;
+
+ if (primID1 != primID0)
+ {
+ ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad);
+ fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
+ fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
+ }
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID;
+
+ uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);
+
+ // read the indices of the 4 verts we want
+ float3 vtx0, vtx1, vtx2, vtx3;
+ GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);
+
+ QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3);
+
+ *newQuad = tempQuad;
+}
+
+// This calculates the children BBs for inner nodes whose children are *all* leaves.
+// Mixed nodes will be updated by the bottom-up pass instead.
+GRL_INLINE uint refit_bottom( global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global struct AABB3f* instance_leaf_aabbs,
+ global struct QBVHNodeN* curNode,
+ struct AABB *childrenAABB,
+ uint backPointer)
+{
+ uint numChildren = 0;
+
+ /* we start refit at leaf nodes, this case is for quad nodes */
+ if (curNode->type == BVH_QUAD_NODE)
+ {
+ global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);
+
+ /* iterate over all quads of the quad node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+ for (uint k = 0; k < numChildren; k++)
+ {
+ struct QuadLeaf Q;
+ get_updated_quad(&quads[k], geomDesc, &Q);
+ quadCopyVertices(&Q, &quads[k]);
+ childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad
+ }
+ }
+
+ /* we start refit at leaf nodes, this case is for procedural nodes */
+ else if (curNode->type == BVH_PROCEDURAL_NODE)
+ {
+ global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);
+
+ /* iterate over all children of the procedural node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+ for (uint k = 0; k < numChildren; k++)
+ {
+ /* extract geomID and primID from leaf */
+ const uint startPrim = QBVHNodeN_startPrim(curNode, k);
+ const uint geomID = ProceduralLeaf_geomIndex(leaf);
+ const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
+
+ /* read bounds from geometry descriptor */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
+ childrenAABB[k].lower.x = aabb.MinX;
+ childrenAABB[k].lower.y = aabb.MinY;
+ childrenAABB[k].lower.z = aabb.MinZ;
+ childrenAABB[k].upper.x = aabb.MaxX;
+ childrenAABB[k].upper.y = aabb.MaxY;
+ childrenAABB[k].upper.z = aabb.MaxZ;
+
+ /* advance leaf pointer to next child */
+ leaf += QBVHNodeN_blockIncr(curNode, k);
+ }
+ }
+
+ /* we start refit at leaf nodes, this case is for instance nodes */
+ else if (curNode->type == BVH_INSTANCE_NODE)
+ {
+ global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
+ global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
+
+ /* iterate over all children of the instance node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+ for (uint k = 0; k < numChildren; k++)
+ {
+ uint leafindex = (instancesLeaves + k) - leafBase;
+ childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
+ childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
+ }
+ }
+
+ return numChildren;
+}
+
+
+
+
+
+// This calculates the children BBs for inner nodes whose children are *all* leaves.
+// Mixed nodes will be updated by the bottom-up pass instead.
+GRL_INLINE uint SUBGROUP_refit_bottom(
+ uniform global struct BVHBase* bvh,
+ uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ uniform global struct AABB3f* instance_leaf_aabbs,
+ uniform global struct QBVHNodeN* curNode,
+ uniform uint backPointer,
+ varying struct AABB* childrenAABB,
+ varying uchar* childrenMask,
+ varying ushort lane,
+ global uchar* is_procedural_instance
+ )
+{
+ uniform uint numChildren = 0;
+ bool enable_procedural_instance = (is_procedural_instance != 0);
+
+ /* we start refit at leaf nodes, this case is for quad nodes */
+ if (curNode->type == BVH_QUAD_NODE)
+ {
+ /* iterate over all quads of the quad node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+
+ uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);
+
+ struct QuadLeaf Q;
+ if (lane < numChildren)
+ {
+ get_updated_quad(&quads[lane], geomDesc, &Q);
+
+ *childrenAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad
+
+ quadCopyVertices(&Q, &quads[lane]);
+ *childrenMask = 0xff;
+ }
+ // FIXME: support leaves with more than one quad
+ }
+
+ /* we start refit at leaf nodes, this case is for procedural nodes */
+ else if (curNode->type == BVH_PROCEDURAL_NODE)
+ {
+ uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);
+
+
+
+ /* iterate over all children of the procedural node and get their bounds */
+ numChildren = (backPointer >> 3) & 0x7;
+
+ varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0;
+ incr = sub_group_scan_exclusive_add(incr);
+
+ if( lane < numChildren )
+ {
+ /* extract geomID and primID from leaf */
+ varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane );
+ varying global struct ProceduralLeaf* my_leaf = leaf + incr;
+ const uint geomID = ProceduralLeaf_geomIndex(my_leaf);
+ const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim);
+
+ /* read bounds from geometry descriptor */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
+ childrenAABB->lower.x = aabb.MinX;
+ childrenAABB->lower.y = aabb.MinY;
+ childrenAABB->lower.z = aabb.MinZ;
+ childrenAABB->upper.x = aabb.MaxX;
+ childrenAABB->upper.y = aabb.MaxY;
+ childrenAABB->upper.z = aabb.MaxZ;
+ *childrenMask = 0xff;
+ }
+ }
+
+ /* we start refit at leaf nodes, this case is for instance nodes */
+ else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE)
+ {
+ uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
+ uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);
+
+ /* iterate over all children of the instance node and get their bounds and masks */
+ numChildren = (backPointer >> 3) & 0x7;
+ if( lane < numChildren )
+ {
+ uint leafindex = (instancesLeaves + lane) - leafBase;
+ childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]);
+ childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]);
+ *childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]);
+ }
+ }
+ else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE)
+ {
+ // Handle procedural-instance leaves.
+ // TODO: Generalize this! The kernel should be rewritten to work with arbitrary mixed-mode leaves.
+
+ numChildren = (backPointer >> 3) & 0x7;
+ uint childType = BVH_INTERNAL_NODE;
+ if ( lane < numChildren )
+ {
+ childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane );
+ if (childType != BVH_INTERNAL_NODE)
+ {
+ uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode );
+ uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
+ uint leafindex = (instancesLeaves + lane) - leafBase;
+ childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
+ childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
+ *childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] );
+
+ // see if the child has flipped from procedural to non-procedural and update the child type field as needed
+ uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] );
+ uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
+ if (newChildType != childType)
+ {
+ InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType );
+ }
+ }
+ }
+
+
+ // don't ascend the tree if all of the children are true internal nodes
+ if (sub_group_all(childType == BVH_INTERNAL_NODE))
+ numChildren = 0;
+ }
+
+ return numChildren;
+}
+
+#define SG_REFIT_WG_SIZE 8
+
+void DO_Refit_per_one_startpoint_sg(
+ global struct BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
+ global struct AABB3f* instance_leaf_aabbs,
+ global uchar* is_procedural_instance )
+{
+ /* get pointer to inner nodes and back pointers */
+ global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* get the inner node that we will consider as a bottom startpoint */
+ const uint numInnerNodes = BVHBase_numNodes(bvh);
+ const uint innerNodeIdx = get_sub_group_global_id();
+
+ varying ushort lane = get_sub_group_local_id();
+
+ if (innerNodeIdx >= numInnerNodes) return;
+
+ varying struct AABB childrenAABB; // one child AABB per lane
+ AABB_init(&childrenAABB);
+
+ varying uchar childrenMask = 0; // one child mask per lane
+
+ global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
+ uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+ uint numChildren = SUBGROUP_refit_bottom(
+ bvh,
+ geosArray,
+ instance_leaf_aabbs,
+ curNode,
+ backPointer,
+ &childrenAABB,
+ &childrenMask,
+ lane,
+ is_procedural_instance
+ );
+
+
+ if (numChildren != 0)
+ {
+ /* update bounds of node */
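+ /* reduce the per-lane child boxes to a single box and broadcast it to all lanes */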
+ struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB);
+ reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0);
+ subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane);
+
+ /* update mask of node */
+ uchar mask = sub_group_reduce_or_N6(childrenMask);
+ curNode->instMask = mask;
+
+ /* Leave this fence in place for all threads: if the WG size is increased (128 was tried) and the
+ fence is issued only by the first thread (similar to morton phase1), the machine hangs. */
+ mem_fence_gpu_invalidate();
+
+ /* refit upper parts of the BVH */
+ /* TODO: this is not going to work for mixed nodes */
+ SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0);
+ }
+} \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl
new file mode 100644
index 00000000000..0a4bd3466af
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_sah_experimental.cl
@@ -0,0 +1,1917 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "common.h"
+#include "instance.h"
+
+#define DBG(x)
+
+#define ENABLE_CHECKS 0
+
+#define ENABLE_32BINS_IN_BREADTH_FIRST_PHASE 1
+
+/* todo: */
+/* - new cross WG code path for first splits */
+/* - optimize find best child loop sequence */
+/* - subgroup_setQBVHNodeN needs work on 6 slots in parallel */
+
+#define DIVIDE_BY_6 1
+
+inline uint getNumPrims(struct BuildRecord *buildRecord)
+{
+ return buildRecord->end - buildRecord->start;
+}
+
+inline void printBuildRecord(struct BuildRecord *record)
+{
+ printf("centroidBounds\n");
+ AABB_print(&record->centroidBounds);
+ printf("start %d end %d size %d depth %d \n", record->start, record->end, record->end - record->start, getBuildRecursionDepth(record));
+}
+
+inline void printBinInfo2(struct BinInfo2 *record)
+{
+ printf("boundsX[%d]\n", BINS * 2);
+ for (uint b = 0; b < BINS * 2; b++)
+ {
+ AABB3f_print(&record->boundsX[b]);
+ printf("counts.x = %d\n", record->counts[b].x);
+ }
+ printf("boundsY[%d]\n", BINS * 2);
+ for (uint b = 0; b < BINS * 2; b++)
+ {
+ AABB3f_print(&record->boundsY[b]);
+ printf("counts.y = %d\n", record->counts[b].y);
+ }
+ printf("boundsZ[%d]\n", BINS * 2);
+ for (uint b = 0; b < BINS * 2; b++)
+ {
+ AABB3f_print(&record->boundsZ[b]);
+ printf("counts.z = %d\n", record->counts[b].z);
+ }
+}
+
+inline void initBinMapping(struct BinMapping *binMapping, struct AABB *centBounds, const uint bins)
+{
+ const float4 eps = 1E-34f;
+ const float4 diag = max(eps, centBounds->upper - centBounds->lower);
+ const float4 scale = (float4)(0.99f * (float)bins) / diag;
+ binMapping->scale = select((float4)(0.0f), scale, diag > eps);
+ binMapping->ofs = centBounds->lower;
+}
+
+inline void atomicExtendLocalBuildRecord(local struct BuildRecord *buildRecord, global struct AABB *primref)
+{
+ const float4 centroid2 = primref->lower + primref->upper;
+ AABB_local_atomic_merge(&buildRecord->centroidBounds, centroid2, centroid2);
+}
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+inline void initBinInfo(struct BinInfo *binInfo)
+{
+ for (uint i = 0; i < BINS; i++)
+ {
+ AABB3f_init(&binInfo->boundsX[i]);
+ AABB3f_init(&binInfo->boundsY[i]);
+ AABB3f_init(&binInfo->boundsZ[i]);
+ binInfo->counts[i] = (uint3)(0);
+ }
+}
+
+inline void subgroup_initBinInfo(struct BinInfo *binInfo)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ for (uint i = subgroupLocalID; i < BINS; i += subgroup_size)
+ {
+ AABB3f_init(&binInfo->boundsX[i]);
+ AABB3f_init(&binInfo->boundsY[i]);
+ AABB3f_init(&binInfo->boundsZ[i]);
+ binInfo->counts[i] = (uint3)(0);
+ }
+}
+
+inline void parallel_initBinInfo(struct BinInfo *binInfo)
+{
+ const uint localID = get_local_id(0);
+ if (localID < BINS)
+ {
+ AABB3f_init(&binInfo->boundsX[localID]);
+ AABB3f_init(&binInfo->boundsY[localID]);
+ AABB3f_init(&binInfo->boundsZ[localID]);
+ binInfo->counts[localID] = (uint3)(0);
+ }
+}
+
+inline void atomicUpdateLocalBinInfo(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref)
+{
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
+ AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper);
+ AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper);
+ AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper);
+ atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
+ atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
+ atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
+}
+
+inline void atomicUpdateLocalBinInfo_nocheck(struct BinMapping *binMapping, local struct BinInfo *binInfo, global struct AABB *primref)
+{
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
+ AABB3f_atomic_merge_local_nocheck(&binInfo->boundsX[i.x], lower, upper);
+ AABB3f_atomic_merge_local_nocheck(&binInfo->boundsY[i.y], lower, upper);
+ AABB3f_atomic_merge_local_nocheck(&binInfo->boundsZ[i.z], lower, upper);
+ atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
+ atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
+ atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
+}
+
+inline void updateBins(struct BinMapping *binMapping, struct BinInfo *binInfo, global struct AABB *primref)
+{
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
+ AABB3f_extendlu(&binInfo->boundsX[i.x], lower.xyz, upper.xyz);
+ AABB3f_extendlu(&binInfo->boundsY[i.y], lower.xyz, upper.xyz);
+ AABB3f_extendlu(&binInfo->boundsZ[i.z], lower.xyz, upper.xyz);
+ binInfo->counts[i.x].x++;
+ binInfo->counts[i.y].y++;
+ binInfo->counts[i.z].z++;
+}
+
+// =====================================================================================================================
+// =====================================================================================================================
+// =====================================================================================================================
+
+inline void parallel_initBinInfo2(struct BinInfo2 *binInfo, const uint bins)
+{
+ const uint localID = get_local_id(0);
+ if (localID < bins)
+ {
+ AABB3f_init(&binInfo->boundsX[localID]);
+ AABB3f_init(&binInfo->boundsY[localID]);
+ AABB3f_init(&binInfo->boundsZ[localID]);
+ binInfo->counts[localID] = (uint3)(0);
+ }
+}
+
+inline void atomicUpdateLocalBinInfo2(struct BinMapping *binMapping, local struct BinInfo2 *binInfo, global struct AABB *primref)
+{
+ const float4 lower = primref->lower;
+ const float4 upper = primref->upper;
+ const float4 p = lower + upper;
+ const uint4 i = convert_uint4((p - binMapping->ofs) * binMapping->scale);
+ AABB3f_atomic_merge_local(&binInfo->boundsX[i.x], lower, upper);
+ AABB3f_atomic_merge_local(&binInfo->boundsY[i.y], lower, upper);
+ AABB3f_atomic_merge_local(&binInfo->boundsZ[i.z], lower, upper);
+ atomic_add((local uint *)&binInfo->counts[i.x] + 0, 1);
+ atomic_add((local uint *)&binInfo->counts[i.y] + 1, 1);
+ atomic_add((local uint *)&binInfo->counts[i.z] + 2, 1);
+}
+
+inline void atomicUpdateGlobalFromLocalBinInfo2(global struct BinInfo2 *dest, local struct BinInfo2 *source, const uint bins)
+{
+ const uint localID = get_local_id(0);
+ if (localID < bins)
+ {
+ AABB3f_atomic_merge_global_local(&dest->boundsX[localID], &source->boundsX[localID]);
+ AABB3f_atomic_merge_global_local(&dest->boundsY[localID], &source->boundsY[localID]);
+ AABB3f_atomic_merge_global_local(&dest->boundsZ[localID], &source->boundsZ[localID]);
+ atomic_add((global uint *)&dest->counts[localID] + 0, source->counts[localID].x);
+ atomic_add((global uint *)&dest->counts[localID] + 1, source->counts[localID].y);
+ atomic_add((global uint *)&dest->counts[localID] + 2, source->counts[localID].z);
+ }
+}
+
+inline uint subgroup_getMaxAreaChild(struct AABB *childrenAABB, const uint numChildren)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+#if 0
+ /*! find best child to split */
+ const float area = (subgroupLocalID < numChildren) & (as_uint(childrenAABB[subgroupLocalID].upper.w) > cfg_minLeafSize) ? childrenAABB[subgroupLocalID].lower.w : -(float)INFINITY;
+ const float maxArea = sub_group_reduce_max(area);
+ const uint mask = intel_sub_group_ballot(area == maxArea);
+ const uint bestChild = maxArea != -(float)INFINITY ? ctz(mask) : -1;
+#else
+ float bestArea = -(float)INFINITY;
+ int bestChild = -1;
+ for (int i = 0; i < numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (as_uint(childrenAABB[i].upper.w) <= cfg_minLeafSize)
+ continue;
+
+ /* find child with largest surface area */
+ if (childrenAABB[i].lower.w > bestArea)
+ {
+ bestChild = i;
+ bestArea = childrenAABB[i].lower.w;
+ }
+ }
+#endif
+ return bestChild;
+}
+
+inline bool AABB_verifyBounds(struct BuildRecord *buildRecord, struct AABB *geometryBounds, struct AABB *primref)
+{
+ const float4 centroid2 = primref->lower + primref->upper;
+
+ if (centroid2.x < buildRecord->centroidBounds.lower.x)
+ return false;
+ if (centroid2.y < buildRecord->centroidBounds.lower.y)
+ return false;
+ if (centroid2.z < buildRecord->centroidBounds.lower.z)
+ return false;
+
+ if (centroid2.x > buildRecord->centroidBounds.upper.x)
+ return false;
+ if (centroid2.y > buildRecord->centroidBounds.upper.y)
+ return false;
+ if (centroid2.z > buildRecord->centroidBounds.upper.z)
+ return false;
+
+ if (primref->lower.x < geometryBounds->lower.x)
+ return false;
+ if (primref->lower.y < geometryBounds->lower.y)
+ return false;
+ if (primref->lower.z < geometryBounds->lower.z)
+ return false;
+
+ if (primref->upper.x > geometryBounds->upper.x)
+ return false;
+ if (primref->upper.y > geometryBounds->upper.y)
+ return false;
+ if (primref->upper.z > geometryBounds->upper.z)
+ return false;
+
+ return true;
+}
+
+/* initialize primref index array */
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+create_primref_index(global struct Globals *globals,
+ global struct AABB *primref,
+ global unsigned int *primref_index)
+{
+ const uint local_size = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+ const uint localID = get_local_id(0);
+
+ const uint startID = (taskID + 0) * globals->numPrimitives / numTasks;
+ const uint endID = (taskID + 1) * globals->numPrimitives / numTasks;
+ for (uint primID = startID + localID; primID < endID; primID += local_size)
+ primref_index[primID] = primID;
+}
+
+// ==========================================================================================================
+// ==========================================================================================================
+// ==========================================================================================================
+
+inline float left_to_right_area16(struct AABB3f *low)
+{
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low);
+ return halfArea_AABB3f(&low_prefix);
+}
+
+inline uint left_to_right_counts16(uint low)
+{
+ return sub_group_scan_exclusive_add(low);
+}
+
+inline float right_to_left_area16(struct AABB3f *low)
+{
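+ /* right-to-left (suffix) min/max scan: reverse the lane order, run an inclusive
+ left-to-right scan, then shuffle each lane's result back to its original position */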
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ struct AABB3f low_reverse = AABB3f_sub_group_shuffle(low, ID);
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse);
+ const float low_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID);
+ return low_area;
+}
+
+inline uint right_to_left_counts16(uint low)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ const uint low_reverse = sub_group_broadcast(low, ID);
+ const uint low_prefix = sub_group_scan_inclusive_add(low_reverse);
+ return sub_group_broadcast(low_prefix, ID);
+}
+
+inline float2 left_to_right_area32(struct AABB3f *low, struct AABB3f *high)
+{
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_exclusive_min_max(low);
+ struct AABB3f low_reduce = AABB3f_sub_group_reduce(low);
+ struct AABB3f high_prefix = AABB3f_sub_group_scan_exclusive_min_max(high);
+ AABB3f_extend(&high_prefix, &low_reduce);
+ const float low_area = halfArea_AABB3f(&low_prefix);
+ const float high_area = halfArea_AABB3f(&high_prefix);
+ return (float2)(low_area, high_area);
+}
+
+inline uint2 left_to_right_counts32(uint low, uint high)
+{
+ const uint low_prefix = sub_group_scan_exclusive_add(low);
+ const uint low_reduce = sub_group_reduce_add(low);
+ const uint high_prefix = sub_group_scan_exclusive_add(high);
+ return (uint2)(low_prefix, low_reduce + high_prefix);
+}
+
+inline float2 right_to_left_area32(struct AABB3f *low, struct AABB3f *high)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ struct AABB3f low_reverse = AABB3f_sub_group_shuffle(high, ID);
+ struct AABB3f high_reverse = AABB3f_sub_group_shuffle(low, ID);
+ struct AABB3f low_prefix = AABB3f_sub_group_scan_inclusive_min_max(&low_reverse);
+ struct AABB3f low_reduce = AABB3f_sub_group_reduce(&low_reverse);
+ struct AABB3f high_prefix = AABB3f_sub_group_scan_inclusive_min_max(&high_reverse);
+ AABB3f_extend(&high_prefix, &low_reduce);
+ const float low_area = sub_group_broadcast(halfArea_AABB3f(&high_prefix), ID);
+ const float high_area = sub_group_broadcast(halfArea_AABB3f(&low_prefix), ID);
+ return (float2)(low_area, high_area);
+}
+
+inline uint2 right_to_left_counts32(uint low, uint high)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint ID = subgroup_size - 1 - subgroupLocalID;
+ const uint low_reverse = sub_group_broadcast(high, ID);
+ const uint high_reverse = sub_group_broadcast(low, ID);
+ const uint low_prefix = sub_group_scan_inclusive_add(low_reverse);
+ const uint low_reduce = sub_group_reduce_add(low_reverse);
+ const uint high_prefix = sub_group_scan_inclusive_add(high_reverse) + low_reduce;
+ return (uint2)(sub_group_broadcast(high_prefix, ID), sub_group_broadcast(low_prefix, ID));
+}
+
+inline ulong getBestSplit(float3 sah, uint ID, const float4 scale, const ulong defaultSplit)
+{
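+ /* pack each candidate split as (SAH bits << 32) | (binID << 2) | dim; with the SAH in the
+ high word, a single 64-bit min-reduction picks the lowest-SAH split, ties resolving to
+ the smaller bin/dim */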
+ ulong splitX = (((ulong)as_uint(sah.x)) << 32) | ((uint)ID << 2) | 0;
+ ulong splitY = (((ulong)as_uint(sah.y)) << 32) | ((uint)ID << 2) | 1;
+ ulong splitZ = (((ulong)as_uint(sah.z)) << 32) | ((uint)ID << 2) | 2;
+ /* ignore zero sized dimensions */
+ splitX = select(splitX, defaultSplit, (ulong)(scale.x == 0));
+ splitY = select(splitY, defaultSplit, (ulong)(scale.y == 0));
+ splitZ = select(splitZ, defaultSplit, (ulong)(scale.z == 0));
+ ulong bestSplit = min(min(splitX, splitY), splitZ);
+ bestSplit = sub_group_reduce_min(bestSplit);
+ return bestSplit;
+}
+
+inline uint fastDivideBy6_uint(uint v)
+{
+#if 1
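+ /* divide by 6 without an integer division: v/6 == (v>>1)/3, and multiplying by
+ 0x55555556 (= ceil(2^32 / 3)) and taking the high 32 bits gives u/3 for the
+ range of u used here (u = v>>1 < 2^31) */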
+ const ulong u = (ulong)v >> 1;
+ return (uint)((u * 0x55555556ul) >> 32);
+#else
+ return v / 6;
+#endif
+}
+
+inline uint3 fastDivideBy6_uint3(uint3 v)
+{
+ return (uint3)(fastDivideBy6_uint(v.x), fastDivideBy6_uint(v.y), fastDivideBy6_uint(v.z));
+}
+
+inline struct Split reduceBinsAndComputeBestSplit16(struct BinInfo *binInfo, const float4 scale, uint startID, uint endID)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ struct AABB3f boundsX = binInfo->boundsX[subgroupLocalID];
+
+ const float lr_areaX = left_to_right_area16(&boundsX);
+ const float rl_areaX = right_to_left_area16(&boundsX);
+
+ struct AABB3f boundsY = binInfo->boundsY[subgroupLocalID];
+
+ const float lr_areaY = left_to_right_area16(&boundsY);
+ const float rl_areaY = right_to_left_area16(&boundsY);
+
+ struct AABB3f boundsZ = binInfo->boundsZ[subgroupLocalID];
+
+ const float lr_areaZ = left_to_right_area16(&boundsZ);
+ const float rl_areaZ = right_to_left_area16(&boundsZ);
+
+ const uint3 counts = binInfo->counts[subgroupLocalID];
+
+ const uint lr_countsX = left_to_right_counts16(counts.x);
+ const uint rl_countsX = right_to_left_counts16(counts.x);
+ const uint lr_countsY = left_to_right_counts16(counts.y);
+ const uint rl_countsY = right_to_left_counts16(counts.y);
+ const uint lr_countsZ = left_to_right_counts16(counts.z);
+ const uint rl_countsZ = right_to_left_counts16(counts.z);
+
+ const float3 lr_area = (float3)(lr_areaX, lr_areaY, lr_areaZ);
+ const float3 rl_area = (float3)(rl_areaX, rl_areaY, rl_areaZ);
+
+#if DIVIDE_BY_6 == 0
+ const uint blocks_shift = SAH_LOG_BLOCK_SHIFT;
+ uint3 blocks_add = (uint3)((1 << blocks_shift) - 1);
+ const uint3 lr_count = ((uint3)(lr_countsX, lr_countsY, lr_countsZ) + blocks_add) >> blocks_shift;
+ const uint3 rl_count = ((uint3)(rl_countsX, rl_countsY, rl_countsZ) + blocks_add) >> blocks_shift;
+#else
+ const uint3 lr_count = fastDivideBy6_uint3((uint3)(lr_countsX, lr_countsY, lr_countsZ) + BVH_NODE_N6 - 1);
+ const uint3 rl_count = fastDivideBy6_uint3((uint3)(rl_countsX, rl_countsY, rl_countsZ) + BVH_NODE_N6 - 1);
+#endif
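+ /* binned SAH for all split positions at once: leftArea * leftBlocks + rightArea * rightBlocks,
+ evaluated per lane for the X/Y/Z dimensions in parallel */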
+ float3 sah = fma(lr_area, convert_float3(lr_count), rl_area * convert_float3(rl_count));
+
+ /* first bin is invalid */
+
+ sah.x = select((float)(INFINITY), sah.x, subgroupLocalID != 0);
+ sah.y = select((float)(INFINITY), sah.y, subgroupLocalID != 0);
+ sah.z = select((float)(INFINITY), sah.z, subgroupLocalID != 0);
+
+ const uint mid = (startID + endID) / 2;
+ const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0;
+
+ const ulong bestSplit = getBestSplit(sah, subgroupLocalID, scale, defaultSplit);
+
+ struct Split split;
+ split.sah = as_float((uint)(bestSplit >> 32));
+ split.dim = (uint)bestSplit & 3;
+ split.pos = (uint)bestSplit >> 2;
+
+ return split;
+}
+
+inline struct Split reduceBinsAndComputeBestSplit32(struct BinInfo2 *binInfo, const float4 scale, uint startID, uint endID)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ struct AABB3f boundsX_low = binInfo->boundsX[subgroupLocalID];
+ struct AABB3f boundsX_high = binInfo->boundsX[subgroupLocalID + subgroup_size];
+
+ const float2 lr_areaX = left_to_right_area32(&boundsX_low, &boundsX_high);
+ const float2 rl_areaX = right_to_left_area32(&boundsX_low, &boundsX_high);
+
+ struct AABB3f boundsY_low = binInfo->boundsY[subgroupLocalID];
+ struct AABB3f boundsY_high = binInfo->boundsY[subgroupLocalID + subgroup_size];
+
+ const float2 lr_areaY = left_to_right_area32(&boundsY_low, &boundsY_high);
+ const float2 rl_areaY = right_to_left_area32(&boundsY_low, &boundsY_high);
+
+ struct AABB3f boundsZ_low = binInfo->boundsZ[subgroupLocalID];
+ struct AABB3f boundsZ_high = binInfo->boundsZ[subgroupLocalID + subgroup_size];
+
+ const float2 lr_areaZ = left_to_right_area32(&boundsZ_low, &boundsZ_high);
+ const float2 rl_areaZ = right_to_left_area32(&boundsZ_low, &boundsZ_high);
+
+ const uint3 counts_low = binInfo->counts[subgroupLocalID];
+ const uint3 counts_high = binInfo->counts[subgroupLocalID + subgroup_size];
+
+ const uint2 lr_countsX = left_to_right_counts32(counts_low.x, counts_high.x);
+ const uint2 rl_countsX = right_to_left_counts32(counts_low.x, counts_high.x);
+ const uint2 lr_countsY = left_to_right_counts32(counts_low.y, counts_high.y);
+ const uint2 rl_countsY = right_to_left_counts32(counts_low.y, counts_high.y);
+ const uint2 lr_countsZ = left_to_right_counts32(counts_low.z, counts_high.z);
+ const uint2 rl_countsZ = right_to_left_counts32(counts_low.z, counts_high.z);
+
+ const uint blocks_shift = SAH_LOG_BLOCK_SHIFT;
+ uint3 blocks_add = (uint3)((1 << blocks_shift) - 1);
+
+ /* low part: bins 0..15 */
+ const float3 lr_area_low = (float3)(lr_areaX.x, lr_areaY.x, lr_areaZ.x);
+ const float3 rl_area_low = (float3)(rl_areaX.x, rl_areaY.x, rl_areaZ.x);
+
+#if DIVIDE_BY_6 == 0
+ const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x) + blocks_add) >> blocks_shift;
+ const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x) + blocks_add) >> blocks_shift;
+
+#else
+ //const uint3 lr_count_low = ((uint3)(lr_countsX.x,lr_countsY.x,lr_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6;
+ //const uint3 rl_count_low = ((uint3)(rl_countsX.x,rl_countsY.x,rl_countsZ.x)+BVH_NODE_N6-1) / BVH_NODE_N6;
+
+ /* skip blocks for breadth-first phase */
+ const uint3 lr_count_low = ((uint3)(lr_countsX.x, lr_countsY.x, lr_countsZ.x));
+ const uint3 rl_count_low = ((uint3)(rl_countsX.x, rl_countsY.x, rl_countsZ.x));
+
+#endif
+
+ float3 sah_low = fma(lr_area_low, convert_float3(lr_count_low), rl_area_low * convert_float3(rl_count_low));
+
+ /* first bin is invalid */
+ // sah_low.x = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.x;
+ // sah_low.y = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.y;
+ // sah_low.z = (subgroupLocalID == 0) ? (float)(INFINITY) : sah_low.z;
+
+ sah_low.x = select((float)(INFINITY), sah_low.x, subgroupLocalID != 0);
+ sah_low.y = select((float)(INFINITY), sah_low.y, subgroupLocalID != 0);
+ sah_low.z = select((float)(INFINITY), sah_low.z, subgroupLocalID != 0);
+
+ /* high part: bins 16..31 */
+
+ const float3 lr_area_high = (float3)(lr_areaX.y, lr_areaY.y, lr_areaZ.y);
+ const float3 rl_area_high = (float3)(rl_areaX.y, rl_areaY.y, rl_areaZ.y);
+#if DIVIDE_BY_6 == 0
+ const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y) + blocks_add) >> blocks_shift;
+ const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y) + blocks_add) >> blocks_shift;
+#else
+ //const uint3 lr_count_high = ((uint3)(lr_countsX.y,lr_countsY.y,lr_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6;
+ //const uint3 rl_count_high = ((uint3)(rl_countsX.y,rl_countsY.y,rl_countsZ.y)+BVH_NODE_N6-1) / BVH_NODE_N6;
+
+ /* skip blocks for breadth-first phase */
+ const uint3 lr_count_high = ((uint3)(lr_countsX.y, lr_countsY.y, lr_countsZ.y));
+ const uint3 rl_count_high = ((uint3)(rl_countsX.y, rl_countsY.y, rl_countsZ.y));
+
+#endif
+ const float3 sah_high = fma(lr_area_high, convert_float3(lr_count_high), rl_area_high * convert_float3(rl_count_high));
+
+ const uint mid = (startID + endID) / 2;
+ const ulong defaultSplit = (((ulong)as_uint((float)(INFINITY))) << 32) | ((uint)mid << 2) | 0;
+
+ const ulong bestSplit_low = getBestSplit(sah_low, subgroupLocalID, scale, defaultSplit);
+ const ulong bestSplit_high = getBestSplit(sah_high, subgroupLocalID + subgroup_size, scale, defaultSplit);
+ const ulong bestSplit = min(bestSplit_low, bestSplit_high);
+
+ struct Split split;
+ split.sah = as_float((uint)(bestSplit >> 32));
+ split.dim = (uint)bestSplit & 3;
+ split.pos = (uint)bestSplit >> 2;
+
+ return split;
+}
+
+// =====================================================================
+
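+/* SAH leaf cost: surface area times the number of primitive blocks, i.e. ceil(prims / 2^block_shift) */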
+inline float leafSAH(float geometryArea, uint prims, uint block_shift)
+{
+ return geometryArea * convert_float((prims + (1 << block_shift) - 1) >> block_shift);
+}
+
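+/* a primitive goes to the left side if its centroid falls into a bin strictly below the split position */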
+inline bool is_left(struct BinMapping *binMapping, struct Split *split, struct AABB *primref)
+{
+ const uint dim = split->dim;
+ const float lower = primref->lower[dim];
+ const float upper = primref->upper[dim];
+ const float c = lower + upper;
+ const uint pos = convert_uint_rtz((c - binMapping->ofs[dim]) * binMapping->scale[dim]);
+ return pos < split->pos;
+}
+
+inline void serial_find_split(global struct AABB *primref,
+ struct BinMapping *binMapping,
+ struct BuildRecord *buildRecord,
+ local struct Split *split,
+ local struct BinInfo *binInfo,
+ global uint *primref_index0,
+ global uint *primref_index1)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint startID = buildRecord->start;
+ const uint endID = buildRecord->end;
+
+ subgroup_initBinInfo(binInfo);
+
+ for (uint t = startID + subgroupLocalID; t < endID; t += subgroup_size)
+ {
+ const uint index = primref_index0[t];
+ primref_index1[t] = index;
+ atomicUpdateLocalBinInfo_nocheck(binMapping, binInfo, &primref[index]);
+ }
+}
+
+inline void serial_partition_index(global struct AABB *primref,
+ struct BinMapping *binMapping,
+ struct BuildRecord *buildRecord,
+ struct Split *inSplit,
+ struct BuildRecord *outLeft,
+ struct BuildRecord *outRight,
+ struct AABB *outGeometryBoundsLeft,
+ struct AABB *outGeometryBoundsRight,
+ global uint *primref_index0,
+ global uint *primref_index1)
+{
+ const uint localID = get_local_id(0);
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroupID = get_sub_group_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint begin = buildRecord->start;
+ const uint end = buildRecord->end;
+ struct Split split = *inSplit;
+
+ struct BuildRecord left;
+ struct BuildRecord right;
+ initBuildRecord(&left, begin, end);
+ initBuildRecord(&right, begin, end);
+
+ struct AABB leftAABB;
+ struct AABB rightAABB;
+ AABB_init(&leftAABB);
+ AABB_init(&rightAABB);
+
+ global uint *l = primref_index0 + begin;
+ global uint *r = primref_index0 + end;
+
+ /* no valid split, just split in the middle */
+ if (split.sah == (float)(INFINITY))
+ {
+ for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint count = sub_group_reduce_add(1);
+ extendBuildRecord(&left, &primref[index]);
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ l[subgroupLocalID] = index;
+ l += count;
+ }
+
+ for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint count = sub_group_reduce_add(1);
+ extendBuildRecord(&right, &primref[index]);
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ r -= count;
+ r[subgroupLocalID] = index;
+ }
+ }
+ else
+ {
+ for (uint i = begin + subgroupLocalID; i < end; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
+ const uint isRight = 1 - isLeft;
+ const uint countLeft = sub_group_reduce_add(isLeft);
+ const uint countRight = sub_group_reduce_add(isRight);
+ const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
+ const uint prefixRight = sub_group_scan_exclusive_add(isRight);
+
+ r -= countRight;
+
+ if (isLeft)
+ {
+ extendBuildRecord(&left, &primref[index]);
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ l[prefixLeft] = index;
+ }
+ else
+ {
+ extendBuildRecord(&right, &primref[index]);
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ r[prefixRight] = index;
+ }
+ l += countLeft;
+ }
+ }
+
+ left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
+ right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
+ leftAABB = AABB_sub_group_reduce(&leftAABB);
+ rightAABB = AABB_sub_group_reduce(&rightAABB);
+
+ if (subgroupLocalID == 0)
+ {
+ uint pos = l - primref_index0; // only the first lane needs to compute "pos"
+ left.end = pos;
+ right.start = pos;
+
+ leftAABB.lower.w = AABB_halfArea(&leftAABB);
+ rightAABB.lower.w = AABB_halfArea(&rightAABB);
+
+ leftAABB.upper.w = as_float(getNumPrimsBuildRecord(&left));
+ rightAABB.upper.w = as_float(getNumPrimsBuildRecord(&right));
+
+ *outLeft = left;
+ *outRight = right;
+ *outGeometryBoundsLeft = leftAABB;
+ *outGeometryBoundsRight = rightAABB;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+#if ENABLE_CHECKS == 1
+ if (subgroupLocalID == 0)
+ {
+ if (AABB_verify(outLeft))
+ {
+ printf("outLeft:\n");
+ printBuildRecord(outLeft);
+ }
+ if (AABB_verify(outRight))
+ {
+ printf("outRight:\n");
+ printBuildRecord(outRight);
+ }
+ if (AABB_verify(outGeometryBoundsLeft))
+ {
+ printf("outGeometryBoundsLeft:\n");
+ AABB_print(outGeometryBoundsLeft);
+ }
+ if (AABB_verify(outGeometryBoundsRight))
+ {
+ printf("outGeometryBoundsRight:\n");
+ AABB_print(outGeometryBoundsRight);
+ }
+
+ for (uint i = outLeft->start; i < outLeft->end; i++)
+ {
+ const uint index = primref_index0[i];
+ if (split.sah != (float)(INFINITY) && !is_left(binMapping, inSplit, &primref[index]))
+ printf("check left %d \n", i);
+ if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index]))
+ printf("check prim ref bounds left %d \n", i);
+ }
+ for (uint i = outRight->start; i < outRight->end; i++)
+ {
+ const uint index = primref_index0[i];
+ if (split.sah != (float)(INFINITY) && is_left(binMapping, inSplit, &primref[index]))
+ printf("check right %d \n", i);
+ if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index]))
+ printf("check prim ref bounds right %d \n", i);
+ }
+ }
+#endif
+}
+
+inline uint subgroup_createLeaf_index(global struct BlockAllocator *allocator,
+ const uint start,
+ const uint end,
+ global struct AABB *primref,
+ uint primID,
+ global char *bvh_mem,
+ unsigned leafSize)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+ const uint items = end - start;
+
+#if ENABLE_CHECKS == 1
+ if (items > BVH_LEAF_N_MAX)
+ printf("error items %d \n", items);
+#endif
+
+ // JDB TODO: Why was this code commented out??
+ //uint offset = (subgroupLocalID == 0) ? alloc_leaf_mem(globals,sizeof(struct Quad)*items) : 0;
+ //offset = sub_group_broadcast(offset,0);
+
+ //uint offset = globals->leaf_mem_allocator_start + start * leafSize;
+ uint offset = allocator->start + start * leafSize;
+ return offset;
+}
+
+inline uint get_qnode_index_for_backptr(void *qnode_base, void *qnode)
+{
+ size_t offset = ((size_t)qnode - (size_t)qnode_base) / sizeof(struct QBVHNodeN);
+ uint offset_u = (uint)offset;
+#if ENABLE_CHECKS
+ if ((size_t)((offset_u << 6) >> 6) != offset)
+ {
+ printf("get_qnode_index_for_backptr - index out of reach");
+ }
+#endif
+ return offset_u;
+}
+
+struct SerialBuildRecurseTemplateConst
+{
+ unsigned leafSize;
+ unsigned leafType;
+ bool allocateBackpointers;
+};
+
+// ====================================================================================
+// ====================================================================================
+// ====================================================================================
+// ====================================================================================
+// ====================================================================================
+
+inline void parallel_find_split(global struct AABB *primref,
+ local struct BuildRecord *buildRecord,
+ local struct Split *bestSplit,
+ local struct BinInfo *binInfo,
+ global uint *primref_index0,
+ global uint *primref_index1)
+{
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+ const uint subgroupID = get_sub_group_id();
+
+ const uint startID = buildRecord->start;
+ const uint endID = buildRecord->end;
+
+ struct BinMapping binMapping;
+ initBinMapping(&binMapping, &buildRecord->centroidBounds, BINS);
+
+ /* init bininfo */
+ parallel_initBinInfo(binInfo);
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (uint t = startID + localID; t < endID; t += local_size)
+ {
+ const uint index = primref_index0[t];
+ primref_index1[t] = index;
+ atomicUpdateLocalBinInfo(&binMapping, binInfo, &primref[index]);
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* find best dimension */
+
+ if (subgroupID == 0)
+ {
+ *bestSplit = reduceBinsAndComputeBestSplit16(binInfo, binMapping.scale, startID, endID);
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline void parallel_find_split32(local uint *local_sync,
+ global struct AABB *primref,
+ local struct BuildRecord *buildRecord,
+ local struct Split *bestSplit,
+ local struct BinInfo2 *binInfo2,
+ global uint *primref_index0,
+ global uint *primref_index1)
+{
+
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+ const uint subgroupID = get_sub_group_id();
+ const uint numSubGroups = get_num_sub_groups();
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ const uint startID = buildRecord->start;
+ const uint endID = buildRecord->end;
+
+ struct BinMapping binMapping;
+ initBinMapping(&binMapping, &buildRecord->centroidBounds, 2 * BINS);
+
+ /* init bininfo */
+ parallel_initBinInfo2(binInfo2, 2 * BINS);
+
+ if (localID == 0)
+ *local_sync = 0;
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (uint t = startID + localID; t < endID; t += local_size)
+ {
+ const uint index = primref_index0[t];
+ primref_index1[t] = index;
+ atomicUpdateLocalBinInfo2(&binMapping, binInfo2, &primref[index]);
+ }
+
+ /* the last subgroup to finish binning (tracked via local_sync) computes the best split position */
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+ uint syncID = subgroupLocalID == 0 ? generic_atomic_add(local_sync, 1) : 0;
+ syncID = sub_group_broadcast(syncID, 0);
+
+ if (syncID + 1 == numSubGroups)
+ {
+ *bestSplit = reduceBinsAndComputeBestSplit32(binInfo2, binMapping.scale, startID, endID);
+ DBG(if (localID == 0) printSplit(bestSplit));
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+inline void parallel_partition_index(local uint *local_sync,
+ global struct AABB *primref,
+ struct BinMapping *binMapping,
+ const uint begin,
+ const uint end,
+ struct Split *inSplit,
+ local struct BuildRecord *outLeft,
+ local struct BuildRecord *outRight,
+ local struct AABB *outGeometryBoundsLeft,
+ local struct AABB *outGeometryBoundsRight,
+ global uint *primref_index0,
+ global uint *primref_index1,
+ uint *atomicCountLeft,
+ uint *atomicCountRight)
+{
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+ const uint subgroupID = get_sub_group_id();
+ const uint numSubGroups = get_num_sub_groups();
+ const uint subgroup_size = get_sub_group_size();
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ const uint size = end - begin;
+ struct Split split = *inSplit;
+
+ /* init bin bounds */
+ if (localID == 0)
+ {
+ initBuildRecord(outLeft, begin, end);
+ initBuildRecord(outRight, begin, end);
+ AABB_init(outGeometryBoundsLeft);
+ AABB_init(outGeometryBoundsRight);
+ *atomicCountLeft = 0;
+ *atomicCountRight = 0;
+ *local_sync = 0;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); // remove ?
+
+ struct BuildRecord left;
+ struct BuildRecord right;
+ initBuildRecord(&left, begin, end);
+ initBuildRecord(&right, begin, end);
+
+ struct AABB leftAABB;
+ struct AABB rightAABB;
+ AABB_init(&leftAABB);
+ AABB_init(&rightAABB);
+
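+ /* no valid split found: fall back to splitting the range in the middle at split.pos */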
+ if (split.sah == (float)(INFINITY))
+ {
+ if (subgroupID == 0)
+ {
+ for (uint i = begin + subgroupLocalID; i < split.pos; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ extendBuildRecord(&left, &primref[index]);
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ primref_index0[i] = index;
+ }
+
+ for (uint i = split.pos + subgroupLocalID; i < end; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ extendBuildRecord(&right, &primref[index]);
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ primref_index0[i] = index;
+ }
+
+ left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
+ right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
+ leftAABB = AABB_sub_group_reduce(&leftAABB);
+ rightAABB = AABB_sub_group_reduce(&rightAABB);
+
+ if (localID == 0)
+ {
+ outLeft->centroidBounds = left.centroidBounds;
+ outRight->centroidBounds = right.centroidBounds;
+
+ *outGeometryBoundsLeft = leftAABB;
+ *outGeometryBoundsRight = rightAABB;
+
+ outLeft->end = split.pos;
+ outRight->start = split.pos;
+
+ outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft);
+ outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight);
+ outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft));
+ outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight));
+ }
+ }
+ }
+ else
+ {
+
+ const int startID = begin + ((subgroupID + 0) * size / numSubGroups);
+ const int endID = begin + ((subgroupID + 1) * size / numSubGroups);
+
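+ /* each subgroup compacts its share of the range: the subgroup prefix sums give each lane
+ its slot, and the atomic counters give the subgroup's base offset on either side */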
+ for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
+ const uint isRight = 1 - isLeft;
+ const uint countLeft = sub_group_reduce_add(isLeft);
+ const uint countRight = sub_group_reduce_add(isRight);
+ const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
+ const uint prefixRight = sub_group_scan_exclusive_add(isRight);
+
+ uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0;
+ offsetLeft = sub_group_broadcast(offsetLeft, 0);
+ uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0;
+ offsetRight = sub_group_broadcast(offsetRight, 0);
+
+ if (isLeft)
+ {
+ extendBuildRecord(&left, &primref[index]);
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ primref_index0[begin + offsetLeft + prefixLeft] = index;
+ }
+ else
+ {
+ extendBuildRecord(&right, &primref[index]);
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ primref_index0[end - (offsetRight + countRight) + prefixRight] = index;
+ }
+ }
+ left.centroidBounds = AABB_sub_group_reduce(&left.centroidBounds);
+ right.centroidBounds = AABB_sub_group_reduce(&right.centroidBounds);
+ leftAABB = AABB_sub_group_reduce(&leftAABB);
+ rightAABB = AABB_sub_group_reduce(&rightAABB);
+
+ AABB_local_atomic_merge(&outLeft->centroidBounds, left.centroidBounds.lower, left.centroidBounds.upper);
+ AABB_local_atomic_merge(&outRight->centroidBounds, right.centroidBounds.lower, right.centroidBounds.upper);
+
+ AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper);
+ AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper);
+
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (subgroupLocalID == 0)
+ {
+ const uint sync = atomic_add(local_sync, 1);
+ if (sync + 1 == numSubGroups)
+ {
+ uint pos = begin + *atomicCountLeft; // a single lane of the last subgroup to finish computes "pos"
+ outLeft->end = pos;
+ outRight->start = pos;
+
+ outGeometryBoundsLeft->lower.w = AABB_halfArea(outGeometryBoundsLeft);
+ outGeometryBoundsRight->lower.w = AABB_halfArea(outGeometryBoundsRight);
+ outGeometryBoundsLeft->upper.w = as_float(getNumPrimsBuildRecord(outLeft));
+ outGeometryBoundsRight->upper.w = as_float(getNumPrimsBuildRecord(outRight));
+ }
+ }
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+#if ENABLE_CHECKS == 1
+ if (localID == 0)
+ {
+ if (outLeft->end <= begin)
+ printf("pos begin error\n");
+ if (outLeft->end > end)
+ printf("pos end error\n");
+
+ for (uint i = outLeft->start; i < outLeft->end; i++)
+ {
+ const uint index = primref_index0[i];
+ //printf("left %d -> %d \n",i,index);
+ if (!is_left(binMapping, inSplit, &primref[index]))
+ printf("check left %d \n", i);
+ if (!AABB_verifyBounds(outLeft, outGeometryBoundsLeft, &primref[index]))
+ printf("check prim ref bounds left %d \n", i);
+ }
+ for (uint i = outRight->start; i < outRight->end; i++)
+ {
+ const uint index = primref_index0[i];
+ //printf("right %d -> %d \n",i,index);
+ if (is_left(binMapping, inSplit, &primref[index]))
+ printf("check right %d \n", i);
+ if (!AABB_verifyBounds(outRight, outGeometryBoundsRight, &primref[index]))
+ printf("check prim ref bounds right %d \n", i);
+ }
+ }
+#endif
+}
+
+
+#define ENABLE_LOOP_BREADTH_FIRST 0
+#if ENABLE_LOOP_BREADTH_FIRST
+// TBD: the layout of this struct might impact performance.
+struct BreadthFirstLoopLocals
+{
+ struct BuildRecord local_current;
+#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
+ struct BinInfo binInfo;
+#else
+ struct BinInfo2 binInfo;
+#endif
+ struct Split split;
+ struct BuildRecord children[BVH_NODE_N + 1];
+ struct AABB childrenAABB[BVH_NODE_N + 1];
+ uint atomicCountLeft;
+ uint atomicCountRight;
+ uint local_sync;
+ uint recordID;
+ uint buildRecordIDs[BUILDRECORD_STACK_SIZE];
+ uint numBuildRecordIDs;
+ bool exit;
+};
+
+
+inline void parallel_build_breadth_first_loopT(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ uint subtreeThreshold,
+ local struct BreadthFirstLoopLocals *L,
+ struct BreadthFirstTemplateConst T)
+{
+ const uint global_size = get_global_size(0);
+ const uint local_size = get_local_size(0);
+ const uint localID = get_local_id(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ const uint subgroupID = get_sub_group_id();
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ /* double buffered primref index array */
+ global uint *primref_index0 = primref_index;
+ global uint *primref_index1 = primref_index + globals->numPrimitives;
+
+ global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);
+
+#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
+ const uint bins = BINS;
+#else
+ const uint bins = 2 * BINS;
+#endif
+
+ if (localID == 0)
+ {
+ L->numBuildRecordIDs = 0;
+ L->exit = false;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ while (1)
+ {
+ if (localID == 0)
+ {
+ if (L->numBuildRecordIDs == 0)
+ {
+ L->recordID = generic_atomic_add(&globals->counter, 1);
+ if (L->recordID >= globals->numBuildRecords)
+ L->exit = true;
+ }
+ else
+ {
+ L->numBuildRecordIDs--;
+ L->recordID = L->buildRecordIDs[L->numBuildRecordIDs];
+ }
+ L->local_current = records[L->recordID];
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+
+ /* no more buildrecords available ? */
+
+ if (L->exit)
+ break;
+
+ local struct BuildRecord *current = &L->local_current;
+ const uint items = getNumPrims(current);
+ const uint depth = getBuildRecursionDepth(current);
+
+ global unsigned int *num_records_output = &globals->numBuildRecords_extended;
+
+ struct QBVHNodeN *qnode = (struct QBVHNodeN *)current->current;
+
+ /* ignore small buildrecords */
+ if (items < max(subtreeThreshold, cfg_minLeafSize))
+ {
+ // do nothing
+ }
+ else
+ {
+ /*! find best split */
+#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
+ parallel_find_split(primref, current, &L->split, &L->binInfo, primref_index0, primref_index1);
+#else
+ parallel_find_split32(&L->local_sync, primref, current, &L->split, &L->binInfo, primref_index0, primref_index1);
+#endif
+ uint numChildren = 2;
+
+ /*! find best split */
+ struct BinMapping binMapping;
+ initBinMapping(&binMapping, &current->centroidBounds, bins);
+
+ parallel_partition_index(&L->local_sync, primref, &binMapping, current->start, current->end, &L->split, &L->children[0], &L->children[1], &L->childrenAABB[0], &L->childrenAABB[1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight);
+
+ while (numChildren < BVH_NODE_N6)
+ {
+ /*! find best child to split */
+ const uint bestChild = subgroup_getMaxAreaChild(L->childrenAABB, numChildren);
+ if (bestChild == -1)
+ break;
+
+ /* perform best found split */
+ local struct BuildRecord *brecord = &L->children[bestChild];
+ local struct BuildRecord *lrecord = &L->children[numChildren + 0];
+ local struct BuildRecord *rrecord = &L->children[numChildren + 1];
+
+#if ENABLE_32BINS_IN_BREADTH_FIRST_PHASE == 0
+ parallel_find_split(primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1);
+#else
+ parallel_find_split32(&L->local_sync, primref, brecord, &L->split, &L->binInfo, primref_index0, primref_index1);
+#endif
+
+ initBinMapping(&binMapping, &brecord->centroidBounds, bins);
+
+ parallel_partition_index(&L->local_sync, primref, &binMapping, brecord->start, brecord->end, &L->split, lrecord, rrecord, &L->childrenAABB[numChildren + 0], &L->childrenAABB[numChildren + 1], primref_index0, primref_index1, &L->atomicCountLeft, &L->atomicCountRight);
+
+ *brecord = *rrecord;
+ L->childrenAABB[bestChild] = L->childrenAABB[numChildren + 1];
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ numChildren++;
+ }
+
+ //sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (localID <= 16 && subgroupID == 0)
+ {
+ global struct BVHBase *bvh_base = (global struct BVHBase *)bvh_mem;
+ global struct QBVHNodeN *nodes_start = BVHBase_nodeData(bvh_base);
+ global uint *back_pointers = BVHBase_backPointers(bvh_base);
+ uint qnode_index = 0;
+ if (T.allocateBackpointers)
+ {
+ /* index of the internal node; this is the key into the backpointers map */
+ qnode_index = get_qnode_index_for_backptr(nodes_start, qnode);
+ // the backpointer (parent index) is already set, but we still need to add/encode the number of children
+ // TODO: we would rather not have to read the value here just to add to it; maybe pass the grandparent pointer in the record..., or use an atomic...
+ back_pointers[qnode_index] += (numChildren << 3);
+ }
+
+ /* sort children based on range size */
+ const uint numPrimsIDs = select((uint)0, (as_uint(L->childrenAABB[subgroupLocalID].upper.w) << 3) | subgroupLocalID, subgroupLocalID < numChildren);
+ //const uint IDs = sortBVHChildrenIDs(numPrimsIDs) & (BVH_NODE_N-1);
+ const uint IDs = numPrimsIDs & 7;
+ const uint pushIDs = convertToPushIndices8(IDs);
+
+ /* alloc #numChildren nodes at once */
+ const uint node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren);
+
+ /* update single relative node pointer and type */
+ const int offset = encodeOffset(bvh_mem, (global void *)qnode, node_offset) >> 6;
+ const uint type = BVH_INTERNAL_NODE;
+
+ /* set parent pointer in child build records */
+ if (subgroupLocalID < numChildren)
+ {
+ setBuildRecursionDepth(&L->children[subgroupLocalID], depth + 1);
+ global uchar *child_data_ptr = (global uchar *)bvh_mem + node_offset + pushIDs * sizeof(struct QBVHNodeN);
+ L->children[subgroupLocalID].current = child_data_ptr;
+ if (T.allocateBackpointers)
+ {
+ uint child_index = get_qnode_index_for_backptr(nodes_start, child_data_ptr);
+ back_pointers[child_index] = qnode_index << 6;
+ }
+ }
+
+ /* write out qbvh node */
+ subgroup_setQBVHNodeN(offset, type, &L->childrenAABB[IDs], numChildren, qnode);
+
+ /* write out child buildrecords to memory */
+
+ uint global_records_offset = (subgroupLocalID == 0) ? atomic_add(num_records_output, numChildren - 1) : 0;
+ global_records_offset = sub_group_broadcast(global_records_offset, 0);
+
+ if (localID == 0)
+ {
+ records[L->recordID] = L->children[0];
+ L->buildRecordIDs[L->numBuildRecordIDs++] = L->recordID;
+ for (uint i = 1; i < numChildren; i++)
+ {
+ const uint ID = globals->numBuildRecords + global_records_offset + i - 1;
+ records[ID] = L->children[i];
+ L->buildRecordIDs[L->numBuildRecordIDs++] = ID;
+ }
+ }
+ }
+ }
+ work_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
+ }
+
+ /* last active HW thread ? */
+ if (localID == 0)
+ {
+ const uint sync = atomic_add(&globals->sync, 1);
+ if (sync + 1 == numTasks)
+ {
+ globals->sync = 0;
+ /* set final number of buildrecords */
+ globals->numBuildRecords += globals->numBuildRecords_extended;
+ globals->numBuildRecords_extended = 0;
+ globals->counter = 0;
+ }
+ }
+}
+
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_breadth_first_loop(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ uint subtreeThreshold)
+{
+ local struct BreadthFirstLoopLocals L;
+ static const struct BreadthFirstTemplateConst T = {
+ false // bool allocateBackpointers;
+ };
+
+ parallel_build_breadth_first_loopT(globals,
+ primref,
+ primref_index,
+ bvh_mem,
+ subtreeThreshold,
+ &L,
+ T);
+}
+
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE / 2, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_breadth_first_loop_backpointers(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ uint subtreeThreshold)
+{
+ local struct BreadthFirstLoopLocals L;
+ static const struct BreadthFirstTemplateConst T = {
+ true // bool allocateBackpointers;
+ };
+
+ parallel_build_breadth_first_loopT(globals,
+ primref,
+ primref_index,
+ bvh_mem,
+ subtreeThreshold,
+ &L,
+ T);
+}
+// ===================================================
+// =============== experimental code =================
+// ===================================================
+#endif
+
+#define ENABLE_GLOBAL_SPLIT 0
+#if ENABLE_GLOBAL_SPLIT
+inline void parallel_partition_segment_index(local uint *local_sync,
+ global struct AABB *primref,
+ struct BinMapping *binMapping,
+ const uint begin,
+ const uint end,
+ const uint global_begin,
+ const uint global_end,
+ struct Split *inSplit,
+ local struct AABB *outLeft,
+ local struct AABB *outRight,
+ local struct AABB *outGeometryBoundsLeft,
+ local struct AABB *outGeometryBoundsRight,
+ global uint *primref_index0,
+ global uint *primref_index1,
+ uint *atomicCountLeft,
+ uint *atomicCountRight)
+{
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+ const uint subgroupID = get_sub_group_id();
+ const uint numSubGroups = get_num_sub_groups();
+ const uint subgroup_size = get_sub_group_size();
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ const uint size = end - begin;
+ struct Split split = *inSplit;
+
+ /* init bin bounds */
+ if (localID == 0)
+ {
+ AABB_init(outLeft);
+ AABB_init(outRight);
+ AABB_init(outGeometryBoundsLeft);
+ AABB_init(outGeometryBoundsRight);
+ *local_sync = 0;
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ struct AABB left;
+ struct AABB right;
+ AABB_init(&left);
+ AABB_init(&right);
+
+ struct AABB leftAABB;
+ struct AABB rightAABB;
+ AABB_init(&leftAABB);
+ AABB_init(&rightAABB);
+
+ const int startID = begin + ((subgroupID + 0) * size / numSubGroups);
+ const int endID = begin + ((subgroupID + 1) * size / numSubGroups);
+
+ for (uint i = startID + subgroupLocalID; i < endID; i += subgroup_size)
+ {
+ const uint index = primref_index1[i];
+ const uint isLeft = is_left(binMapping, &split, &primref[index]) ? 1 : 0;
+ const uint isRight = 1 - isLeft;
+ const uint countLeft = sub_group_reduce_add(isLeft);
+ const uint countRight = sub_group_reduce_add(isRight);
+ const uint prefixLeft = sub_group_scan_exclusive_add(isLeft);
+ const uint prefixRight = sub_group_scan_exclusive_add(isRight);
+
+ uint offsetLeft = subgroupLocalID == 0 ? generic_atomic_add(atomicCountLeft, countLeft) : 0;
+ offsetLeft = sub_group_broadcast(offsetLeft, 0);
+ uint offsetRight = subgroupLocalID == 0 ? generic_atomic_add(atomicCountRight, countRight) : 0;
+ offsetRight = sub_group_broadcast(offsetRight, 0);
+
+ if (isLeft)
+ {
+ AABB_extend_point(&left, AABB_centroid2(&primref[index]));
+ AABB_extendlu(&leftAABB, primref[index].lower, primref[index].upper);
+ primref_index0[global_begin + offsetLeft + prefixLeft] = index;
+ }
+ else
+ {
+ AABB_extend_point(&right, AABB_centroid2(&primref[index]));
+ AABB_extendlu(&rightAABB, primref[index].lower, primref[index].upper);
+ primref_index0[global_end - (offsetRight + countRight) + prefixRight] = index;
+ }
+ }
+ left = AABB_sub_group_reduce(&left);
+ right = AABB_sub_group_reduce(&right);
+ leftAABB = AABB_sub_group_reduce(&leftAABB);
+ rightAABB = AABB_sub_group_reduce(&rightAABB);
+
+ AABB_local_atomic_merge(outLeft, left.lower, left.upper);
+ AABB_local_atomic_merge(outRight, right.lower, right.upper);
+
+ AABB_local_atomic_merge(outGeometryBoundsLeft, leftAABB.lower, leftAABB.upper);
+ AABB_local_atomic_merge(outGeometryBoundsRight, rightAABB.lower, rightAABB.upper);
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+__attribute__((reqd_work_group_size(BINS * 2, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel global_init_split_iteration(global struct Globals *globals,
+ global struct GlobalBuildRecord *global_record,
+ global char *bvh_mem,
+ const uint subTreeThreshold)
+{
+ const uint localID = get_local_id(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);
+
+ /* for each build record with size > subTreeThreshold initialize a global build record */
+
+ const uint startID = (taskID + 0) * globals->numBuildRecords / numTasks;
+ const uint endID = (taskID + 1) * globals->numBuildRecords / numTasks;
+
+ for (uint i = startID; i < endID; i++)
+ {
+ global struct BuildRecord *buildRecord = &records[i];
+ DBG(if (localID == 0) printf("i %d subTreeThreshold %d size %d \n", i, subTreeThreshold, buildRecord->end - buildRecord->start));
+
+ if ((buildRecord->end - buildRecord->start) > subTreeThreshold)
+ {
+ uint ID = localID == 0 ? generic_atomic_add(&globals->numGlobalBuildRecords, 1) : 0;
+
+ ID = work_group_broadcast(ID, 0);
+ global struct BinInfo2 *binInfo = &global_record[ID].binInfo;
+ global struct BinMapping *binMapping = &global_record[ID].binMapping;
+ initBinMapping(binMapping, &buildRecord->centroidBounds, 2 * BINS);
+ parallel_initBinInfo2(binInfo, 2 * BINS);
+ if (localID == 0)
+ {
+ global_record[ID].range.start = buildRecord->start;
+ global_record[ID].range.end = buildRecord->end;
+ global_record[ID].atomicCountLeft = 0;
+ global_record[ID].atomicCountRight = 0;
+ global_record[ID].buildRecordID = i;
+ AABB_init(&global_record[ID].leftCentroid);
+ AABB_init(&global_record[ID].rightCentroid);
+ AABB_init(&global_record[ID].leftGeometry);
+ AABB_init(&global_record[ID].rightGeometry);
+ }
+ }
+ }
+ DBG(
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+ if (localID == 0)
+ printf("globals->numGlobalBuildRecords %d \n", globals->numGlobalBuildRecords););
+}
+
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel global_bin_iteration(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ global struct GlobalBuildRecord *global_record)
+{
+ const uint localID = get_local_id(0);
+ const uint blockSize = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;
+
+ /* early out */
+ if (numGlobalBuildRecords == 0)
+ return;
+
+ /* double buffered primref index array */
+ global uint *primref_index0 = primref_index;
+ global uint *primref_index1 = primref_index + globals->numPrimitives;
+
+ uint numBlocks = 0;
+
+ /* get total number of blocks, size of block == WG size */
+ for (uint i = 0; i < numGlobalBuildRecords; i++)
+ numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize;
+
+ const uint startBlockID = (taskID + 0) * numBlocks / numTasks;
+ const uint endBlockID = (taskID + 1) * numBlocks / numTasks;
+ uint numBlockIDs = endBlockID - startBlockID;
+
+ uint splitRecordID = 0;
+ uint offset_start = 0;
+ uint offset_end = 0;
+ uint cur_blocks = 0;
+
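+    /* Map this workgroup's [startBlockID, endBlockID) range onto the global build
+     * records: find the record that contains startBlockID, plus the offsets of the
+     * first/last element this group bins within it. The remaining blocks spill
+     * over into the following records inside the while loop below. */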
+ for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++)
+ {
+ const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
+ const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
+ if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks)
+ {
+ const uint preBlocks = startBlockID - blockCounter;
+ cur_blocks = min(numBlockIDs, blocks - preBlocks);
+ offset_start = preBlocks * blockSize;
+ offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord);
+ break;
+ }
+ blockCounter += blocks;
+ }
+
+ if (localID == 0)
+ DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
+
+ local struct BinInfo2 local_binInfo;
+ parallel_initBinInfo2(&local_binInfo, 2 * BINS);
+ struct BinMapping binMapping = global_record[splitRecordID].binMapping;
+
+ while (1)
+ {
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ const uint startID = global_record[splitRecordID].range.start + offset_start;
+ const uint endID = global_record[splitRecordID].range.start + offset_end;
+
+ if (localID == 0)
+ DBG(printf("taskID %d startID %d endID %d \n", taskID, startID, endID));
+
+ for (uint i = startID + localID; i < endID; i += blockSize)
+ {
+ const uint index = primref_index0[i];
+ primref_index1[i] = index;
+ atomicUpdateLocalBinInfo2(&binMapping, &local_binInfo, &primref[index]);
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE); //FIXME: remove, do local sync
+ atomicUpdateGlobalFromLocalBinInfo2(&global_record[splitRecordID].binInfo, &local_binInfo, 2 * BINS);
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ numBlockIDs -= cur_blocks;
+ if (numBlockIDs == 0)
+ break;
+
+ splitRecordID++;
+ parallel_initBinInfo2(&local_binInfo, 2 * BINS);
+ binMapping = global_record[splitRecordID].binMapping;
+
+ const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
+ const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
+ cur_blocks = min(numBlockIDs, blocks);
+ offset_start = 0;
+ offset_end = min(cur_blocks * blockSize, sizeRecord);
+
+ if (localID == 0)
+ DBG(printf("taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
+ }
+}
+
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+global_compute_best_split_iteration(global struct Globals *globals,
+ global char *bvh_mem,
+ global struct GlobalBuildRecord *global_record)
+{
+ const uint localID = get_local_id(0);
+ const uint blockSize = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;
+
+ /* early out */
+ if (numGlobalBuildRecords == 0)
+ return;
+
+ const uint startRecordID = (taskID + 0) * numGlobalBuildRecords / numTasks;
+ const uint endRecordID = (taskID + 1) * numGlobalBuildRecords / numTasks;
+ for (uint i = startRecordID; i < endRecordID; i++)
+ {
+ struct Split split = reduceBinsAndComputeBestSplit32(&global_record[i].binInfo,
+ global_record[i].binMapping.scale,
+ global_record[i].range.start,
+ global_record[i].range.end);
+ if (localID == 0)
+ {
+ global_record[i].split = split;
+ global_record[i].atomicCountLeft = 0;
+ global_record[i].atomicCountRight = 0;
+ DBG(printSplit(&global_record[i].split));
+ }
+ }
+}
+
+__attribute__((reqd_work_group_size(MAX_WORKGROUP_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+global_partition_iteration(global struct Globals *globals,
+ global struct AABB *primref,
+ global uint *primref_index,
+ global char *bvh_mem,
+ global struct GlobalBuildRecord *global_record)
+{
+
+ const uint localID = get_local_id(0);
+ const uint blockSize = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+
+ const uint numGlobalBuildRecords = globals->numGlobalBuildRecords;
+
+ /* early out */
+ if (numGlobalBuildRecords == 0)
+ return;
+
+ /* double buffered primref index array */
+ global uint *primref_index0 = primref_index;
+ global uint *primref_index1 = primref_index + globals->numPrimitives;
+
+ uint numBlocks = 0;
+
+ /* get total number of blocks, size of block == WG size */
+ for (uint i = 0; i < numGlobalBuildRecords; i++)
+ numBlocks += (global_record[i].range.end - global_record[i].range.start + blockSize - 1) / blockSize;
+
+ const uint startBlockID = (taskID + 0) * numBlocks / numTasks;
+ const uint endBlockID = (taskID + 1) * numBlocks / numTasks;
+ uint numBlockIDs = endBlockID - startBlockID;
+
+ uint splitRecordID = 0;
+ uint offset_start = 0;
+ uint offset_end = 0;
+ uint cur_blocks = 0;
+
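+    /* Same block-to-record mapping as in global_bin_iteration: locate the record
+     * that contains this group's first block and the offsets within it. */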
+ for (uint blockCounter = 0; splitRecordID < numGlobalBuildRecords; splitRecordID++)
+ {
+ const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
+ const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
+ if (startBlockID >= blockCounter && startBlockID < blockCounter + blocks)
+ {
+ const uint preBlocks = startBlockID - blockCounter;
+ cur_blocks = min(numBlockIDs, blocks - preBlocks);
+ offset_start = preBlocks * blockSize;
+ offset_end = min(offset_start + cur_blocks * blockSize, sizeRecord);
+ break;
+ }
+ blockCounter += blocks;
+ }
+
+ if (localID == 0)
+ DBG(printf("partition taskID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
+
+ local struct AABB centroidAABB[2];
+ local struct AABB geometryAABB[2];
+ local uint local_sync;
+
+ while (1)
+ {
+
+ const uint startID = global_record[splitRecordID].range.start + offset_start;
+ const uint endID = global_record[splitRecordID].range.start + offset_end;
+
+ struct BinMapping binMapping = global_record[splitRecordID].binMapping;
+ struct Split split = global_record[splitRecordID].split;
+
+ const uint global_start = global_record[splitRecordID].range.start;
+ const uint global_end = global_record[splitRecordID].range.end;
+
+ if (localID == 0)
+ DBG(printf("partition taskID %d startID %d endID %d numBlocks %d splitRecordID %d numBlockIDs %d offset_start %d offset_end %d cur_blocks %d \n", taskID, startID, endID, numBlocks, splitRecordID, numBlockIDs, offset_start, offset_end, cur_blocks));
+
+        parallel_partition_segment_index(&local_sync, primref, &binMapping,
+                                         startID, endID, global_start, global_end, &split,
+                                         &centroidAABB[0], &centroidAABB[1],
+                                         &geometryAABB[0], &geometryAABB[1],
+                                         primref_index0, primref_index1,
+                                         &global_record[splitRecordID].atomicCountLeft,
+                                         &global_record[splitRecordID].atomicCountRight);
+
+ /* update global structures */
+ if (localID == 0)
+ {
+ AABB_global_atomic_merge(&global_record[splitRecordID].leftCentroid, &centroidAABB[0]);
+ AABB_global_atomic_merge(&global_record[splitRecordID].rightCentroid, &centroidAABB[1]);
+ AABB_global_atomic_merge(&global_record[splitRecordID].leftGeometry, &geometryAABB[0]);
+ AABB_global_atomic_merge(&global_record[splitRecordID].rightGeometry, &geometryAABB[1]);
+ }
+
+ numBlockIDs -= cur_blocks;
+ if (numBlockIDs == 0)
+ break;
+
+ splitRecordID++;
+
+ const uint sizeRecord = global_record[splitRecordID].range.end - global_record[splitRecordID].range.start;
+ const uint blocks = (sizeRecord + blockSize - 1) / blockSize;
+ cur_blocks = min(numBlockIDs, blocks);
+ offset_start = 0;
+ offset_end = min(cur_blocks * blockSize, sizeRecord);
+ }
+}
+
+inline void printBinaryNode(struct AABB *aabb)
+{
+ printf("lower %f upper %f lower.w %d upper.w %d \n", aabb->lower, aabb->upper, as_uint(aabb->lower.w), as_uint(aabb->upper.w));
+}
+
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel global_finalize_iteration(global struct Globals *globals,
+ global struct GlobalBuildRecord *global_record,
+ global char *bvh_mem,
+ global struct AABB *binary_nodes)
+{
+ const uint localID = get_local_id(0);
+ const uint localSize = get_local_size(0);
+ const uint groupID = get_group_id(0);
+ const uint numGroups = get_num_groups(0);
+
+ global struct BuildRecord *records = getBuildRecords(bvh_mem, globals);
+
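+    /* For every partitioned global record: shrink the original build record to the
+     * left half, append a new build record for the right half, and hang two child
+     * binary nodes under the record's binary node; lower.w carries the build record
+     * / child index and upper.w == -1 marks a binary leaf. */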
+ for (uint i = localID; i < globals->numGlobalBuildRecords; i += localSize)
+ {
+ const uint buildRecordID = global_record[i].buildRecordID;
+ const uint binaryNodeID = as_uint(records[buildRecordID].centroidBounds.lower.w);
+ /* left child buildrecord */
+ const uint leftID = buildRecordID;
+ records[leftID].start = global_record[i].range.start;
+ records[leftID].end = global_record[i].range.start + global_record[i].atomicCountLeft;
+ records[leftID].centroidBounds = global_record[i].leftCentroid;
+ /* right child buildrecord */
+ const uint rightID = generic_atomic_add(&globals->numBuildRecords, 1);
+ records[rightID].start = global_record[i].range.start + global_record[i].atomicCountLeft;
+ records[rightID].end = global_record[i].range.end;
+ records[rightID].centroidBounds = global_record[i].rightCentroid;
+ /* two binary nodes */
+ const uint binaryChildID = generic_atomic_add(&globals->numGlobalBinaryNodes, 2);
+ binary_nodes[binaryNodeID].lower.w = as_float(binaryChildID + 0);
+ binary_nodes[binaryNodeID].upper.w = as_float(binaryChildID + 1);
+ binary_nodes[binaryChildID + 0] = global_record[i].leftGeometry;
+ binary_nodes[binaryChildID + 1] = global_record[i].rightGeometry;
+ binary_nodes[binaryChildID + 0].lower.w = as_float(leftID);
+ binary_nodes[binaryChildID + 0].upper.w = as_float(-1);
+ binary_nodes[binaryChildID + 1].lower.w = as_float(rightID);
+ binary_nodes[binaryChildID + 1].upper.w = as_float(-1);
+ records[leftID].centroidBounds.lower.w = as_float(binaryChildID + 0);
+ records[rightID].centroidBounds.lower.w = as_float(binaryChildID + 1);
+ }
+
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (localID == 0)
+ {
+ const uint sync = atomic_add(&globals->sync, 1);
+ if (sync + 1 == numGroups)
+ {
+ globals->sync = 0;
+ DBG(printf("globals->numBuildRecords %d \n", globals->numBuildRecords));
+ DBG(
+ for (uint i = 0; i < globals->numBuildRecords; i++) {
+ printf("i %d \n", i);
+ printBuildRecord(&records[i]);
+ } printf("Binary Tree \n");
+ for (uint i = 0; i < globals->numGlobalBinaryNodes; i++) {
+ printf("i %d \n", i);
+ printBinaryNode(&binary_nodes[i]);
+ }
+
+ );
+ globals->numGlobalBuildRecords = 0;
+ }
+ }
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel global_build_top_level(global struct Globals *globals,
+ global struct GlobalBuildRecord *global_record,
+ global char *bvh_mem,
+ global struct AABB *binary_nodes)
+{
+#define MAX_TOP_LEVEL_STACK_DEPTH 32
+ struct AABB stack[MAX_TOP_LEVEL_STACK_DEPTH];
+ global uchar *stackParentPtrs[MAX_TOP_LEVEL_STACK_DEPTH];
+ struct AABB childrenAABB[BVH_NODE_N6];
+ float childrenHalfArea[BVH_NODE_N6];
+
+ /* build records */
+ global struct BuildRecord *record = getBuildRecords(bvh_mem, globals);
+
+ struct BVHBase *base = (struct BVHBase *)bvh_mem;
+ struct QBVHNodeN *qnode_root = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);
+
+ uint stack_index = 1;
+ stack[0] = binary_nodes[0];
+ stackParentPtrs[0] = (global uchar *)qnode_root;
+
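+    /* Collapse the binary tree into BVH_NODE_N6-wide nodes: pop a binary node,
+     * repeatedly open the child with the largest surface area until up to
+     * BVH_NODE_N6 children are collected, emit one internal node and push the
+     * children; binary leaves just record their parent pointer in the build
+     * record. */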
+ while (stack_index != 0)
+ {
+ stack_index--;
+
+ childrenAABB[0] = stack[stack_index];
+ struct QBVHNodeN *qnode = (struct QBVHNodeN *)stackParentPtrs[stack_index];
+ childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]);
+
+ /* buildrecord leaf => set parent pointer and continue*/
+ DBG(
+ printf("stack_index %d \n", stack_index);
+ printf("as_uint(childrenAABB[0].upper.w) %d \n", as_uint(childrenAABB[0].upper.w)););
+
+ if (as_uint(childrenAABB[0].upper.w) == -1)
+ {
+ const uint buildRecordID = as_uint(childrenAABB[0].lower.w);
+ DBG(
+ printf("leaf buildRecordID %d \n", buildRecordID);
+ printBuildRecord(&record[buildRecordID]);)
+
+ record[buildRecordID].current = (global uchar *)qnode;
+ continue;
+ }
+
+ childrenHalfArea[0] = AABB_halfArea(&childrenAABB[0]);
+
+ uint numChildren = 1;
+ while (numChildren < BVH_NODE_N6)
+ {
+ // FIXME
+
+ /*! find best child to split */
+ float bestArea = -(float)INFINITY;
+ int bestChild = -1;
+ for (int i = 0; i < numChildren; i++)
+ {
+ /* ignore leaves as they cannot get split */
+ if (as_uint(childrenAABB[i].upper.w) == -1)
+ continue;
+
+ /* find child with largest surface area */
+ if (childrenHalfArea[i] > bestArea)
+ {
+ bestChild = i;
+                    bestArea = childrenHalfArea[i];
+ }
+ }
+ if (bestChild == -1)
+ break;
+ const uint leftID = as_uint(childrenAABB[bestChild].lower.w);
+ const uint rightID = as_uint(childrenAABB[bestChild].upper.w);
+ childrenAABB[bestChild] = binary_nodes[leftID];
+ childrenAABB[numChildren] = binary_nodes[rightID];
+ childrenHalfArea[bestChild] = AABB_halfArea(&childrenAABB[bestChild]);
+ childrenHalfArea[numChildren] = AABB_halfArea(&childrenAABB[numChildren]);
+ numChildren++;
+ }
+
+ const uint child_node_offset = alloc_single_node_mem(globals, sizeof(struct QBVHNodeN) * numChildren);
+
+ /* update single relative node pointer */
+ const int offset = encodeOffset(bvh_mem, (global void *)qnode, child_node_offset) >> 6;
+ const uint type = BVH_INTERNAL_NODE;
+
+ setQBVHNodeN(offset, type, childrenAABB, numChildren, qnode);
+
+ DBG(
+ printQBVHNodeN(qnode);
+ printf("numChildren %d \n", numChildren);
+ for (uint i = 0; i < numChildren; i++)
+ AABB_print(&childrenAABB[i]););
+
+ /* update parent pointer of build records of all children */
+ for (uint ID = 0; ID < numChildren; ID++)
+ {
+ stack[stack_index] = childrenAABB[ID];
+ stackParentPtrs[stack_index] = (global uchar *)bvh_mem + child_node_offset + ID * sizeof(struct QBVHNodeN);
+ stack_index++;
+ }
+ }
+}
+
+#endif
diff --git a/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h
new file mode 100644
index 00000000000..b8cf7288f6a
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h
@@ -0,0 +1,1507 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "bvh_build_refit.h"
+#include "libs/lsc_intrinsics.h"
+
+
+#define REFIT_DEBUG_CHECKS 0
+#define REFIT_VERBOSE_LOG 0
+
+#define NUM_STARTPOINTS_IN_SLM (1024)
+
+GRL_INLINE void storeAABBToL1(struct AABB aabb, struct AABB* ptr)
+{
+ uint8 val = (uint8)(
+ as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w),
+ as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w));
+
+ store_uint8_L1WB_L3WB((__global uint8*) ptr, 0, val);
+}
+
+GRL_INLINE void storeAABBToL3(struct AABB aabb, struct AABB* ptr)
+{
+ uint8 val = (uint8)(
+ as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w),
+ as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w));
+
+ store_uint8_L1UC_L3WB((__global uint8*) ptr, 0, val);
+}
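+// The L1WB variant keeps the box resident in L1 for reuse within the same
+// group; the L1UC variant bypasses L1 and writes back to L3 so that other
+// workgroups (e.g. the tip-treelet pass) can observe the box.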
+
+typedef struct Treelet_by_single_group_locals
+{
+ uint startpoints[NUM_STARTPOINTS_IN_SLM];
+} Treelet_by_single_group_locals;
+
+typedef struct SquashedInputGroupDesc {
+ qword bvh;
+ qword scratch;
+ uint groupInTree;
+    uint totalNumGroups; // valid only for the 0th element in the array; otherwise it's just padding
+} SquashedInputGroupDesc;
+
+//
+//
+// update primitives
+//
+//
+
+typedef struct SquashedInput {
+ global struct BVHBase* pBvh;
+ global void* pInput;
+ global struct AABB* bbox_scratch;
+} SquashedInput;
+
+
+
+// updates one quad leaf and gets the BBOX containing it
+GRL_INLINE void refit_bottom_child_quad(
+ global struct QuadLeaf* quad,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ struct AABB* childAABB)
+{
+ struct QuadLeaf Q;
+ get_updated_quad(quad, geomDesc, &Q);
+ quadCopyVertices(&Q, quad);
+ *childAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad
+}
+
+// procedurals will have to take the old path at first
+#if 0
+// updates one procedural leaf and gets the BBOX containing it
+GRL_INLINE void refit_bottom_child_procedural(
+ global struct ProceduralLeaf** pleaf,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ struct AABB* childAABB)
+{
+ global struct ProceduralLeaf* leaf = *pleaf;
+ /* extract geomID and primID from leaf */
+ const uint startPrim = QBVHNodeN_startPrim(curNode, child_idx);
+ const uint geomID = ProceduralLeaf_geomIndex(leaf);
+ const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
+
+ /* read bounds from geometry descriptor */
+ struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
+ childAABB->lower.x = aabb.MinX;
+ childAABB->lower.y = aabb.MinY;
+ childAABB->lower.z = aabb.MinZ;
+ childAABB->upper.x = aabb.MaxX;
+ childAABB->upper.y = aabb.MaxY;
+ childAABB->upper.z = aabb.MaxZ;
+
+ /* advance leaf pointer to next child */
+ *pleaf = leaf + QBVHNodeN_blockIncr(curNode, child_idx);
+}
+
+
+GRL_INLINE void update_procedural_leafs(
+ global struct BVHBase* bvh,
+ global void* input,
+ global struct AABB* bbox_scratch,
+ uint id,
+ uint num_done_by_one_thread)
+{
+ uint numLeaves = BVHBase_GetNumQuads(bvh);
+ uint leafsIndexOffset = bvh->proceduralDataStart - BVH_ROOT_NODE_OFFSET / 64;
+    global ProceduralLeaf* leafs = (global ProceduralLeaf*)BVHBase_GetProceduralLeaves(bvh);
+ uint start_leaf = id * num_done_by_one_thread;
+ uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves);
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
+
+ for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++)
+ {
+ struct AABB theAABB;
+ refit_bottom_child_procedural(leafs + leaf_id, geosArray, &theAABB);
+ theAABB.lower.w = as_float(0xABBADEFF);
+ theAABB.upper.w = 0x00;
+        storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]);
+ }
+}
+#endif
+
+GRL_INLINE void update_quads(
+ global struct BVHBase* bvh,
+ global void* input,
+ global struct AABB* bbox_scratch,
+ uint id,
+ uint num_done_by_one_thread)
+{
+ uint numLeaves = BVHBase_GetNumQuads(bvh);
+ uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
+ global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
+ uint start_leaf = id * num_done_by_one_thread;
+ uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves);
+
+ global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
+
+ for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++)
+ {
+ struct AABB theAABB;
+ refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB);
+ theAABB.lower.w = as_float(0xABBADEFF);
+ theAABB.upper.w = 0x00;
+ storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]);
+ }
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// core bottom-up update functions
+//
+//
+
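+// Quantizes an AABB into the 8-bit child bounds of an internal node: lower
+// bounds are rounded down and upper bounds rounded up relative to the node
+// origin/exponent, so the quantized box always encloses the input box.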
+GRL_INLINE void quantise_bounds(
+ struct AABB* input_aabb, float3 len, float3 mant, float3 org, int3 exp,
+ uchar3* lower_uchar,
+ uchar3* upper_uchar)
+{
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ struct AABB child_aabb = conservativeAABB(input_aabb); // conservative ???
+
+ float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
+ lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
+ float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
+ upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
+
+ *lower_uchar = convert_uchar3_rtn(lower);
+ *upper_uchar = convert_uchar3_rtp(upper);
+}
+
+typedef struct Qbounds_as_DW {
+ uint32_t xLL; uint32_t xLU; uint32_t xUU;
+ uint32_t yLL; uint32_t yLU; uint32_t yUU;
+ uint32_t zLL; uint32_t zLU; uint32_t zUU;
+} Qbounds_as_DW;
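+// DWORD view of the per-axis lower_x[6]/upper_x[6] byte arrays of an internal
+// node: xLL holds the lower bytes of children 0-3, xLU the lower bytes of
+// children 4-5 plus the upper bytes of children 0-1, and xUU the upper bytes
+// of children 2-5 (likewise for y/z). Assembling the node this way in
+// registers allows full-DWORD stores instead of byte-granular writes.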
+
+GRL_INLINE void encodeQuantisedDataAsDW(
+ uchar3 lower_uchar,
+ uchar3 upper_uchar,
+ uint idx,
+ Qbounds_as_DW* qbounds)
+{
+ uint shift_init = idx * 8;
+ if (idx >= 4) {
+ uint shift = (shift_init - 32);
+ qbounds->xLU |= ((uint)lower_uchar.x) << shift;
+ qbounds->yLU |= ((uint)lower_uchar.y) << shift;
+ qbounds->zLU |= ((uint)lower_uchar.z) << shift;
+ }
+ else {
+ qbounds->xLL |= ((uint)lower_uchar.x) << shift_init;
+ qbounds->yLL |= ((uint)lower_uchar.y) << shift_init;
+ qbounds->zLL |= ((uint)lower_uchar.z) << shift_init;
+ }
+
+ if (idx < 2) {
+ uint shift = (shift_init + 16);
+ qbounds->xLU |= ((uint)upper_uchar.x) << shift;
+ qbounds->yLU |= ((uint)upper_uchar.y) << shift;
+ qbounds->zLU |= ((uint)upper_uchar.z) << shift;
+ }
+ else {
+ uint shift = (shift_init - 16);
+
+ qbounds->xUU |= ((uint)upper_uchar.x) << shift;
+ qbounds->yUU |= ((uint)upper_uchar.y) << shift;
+ qbounds->zUU |= ((uint)upper_uchar.z) << shift;
+ }
+}
+
+GRL_INLINE void encodeChildBounds(uchar3 lower_uchar, uchar3 upper_uchar, uint ch, struct InternalNode* qnode)
+{
+ qnode->lower_x[ch] = lower_uchar.x; qnode->upper_x[ch] = upper_uchar.x;
+ qnode->lower_y[ch] = lower_uchar.y; qnode->upper_y[ch] = upper_uchar.y;
+ qnode->lower_z[ch] = lower_uchar.z; qnode->upper_z[ch] = upper_uchar.z;
+}
+
+
+GRL_INLINE GRL_OVERLOADABLE void InternalNode_setBounds_skip_prev(struct InternalNode* qbvh_node, uint prevChildIdx, struct AABB* prev_input_aabb, struct AABB* input_aabb, uint childrenIndex, const uint numChildren, struct AABB* aabb_reduced)
+{
+
+ int3 exp;
+ const float up = 1.0f + ulp;
+ struct AABB conservative_aabb = conservativeAABB(aabb_reduced);
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ const float3 mant = frexp_vec3(len, &exp);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->lower[0] = org.x; qbvh_node->lower[1] = org.y; qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp_x = exp.x; qbvh_node->exp_y = exp.y; qbvh_node->exp_z = exp.z;
+
+ Qbounds_as_DW qbounds = { 0x0 };
+
+
+ {
+ uchar3 lower_uchar, upper_uchar;
+ quantise_bounds(prev_input_aabb, len, mant, org, exp, &lower_uchar, &upper_uchar);
+
+        // encode invalid children; it's enough to set 0x80 in the lower_x bytes
+ uint shift = numChildren * 8;
+ uint shift2 = min(shift, 31u);
+ qbounds.xLL = (0x80808080u << shift2);
+ uint shift3 = max(shift, 32u) - 32;
+ qbounds.xLU = (ushort)(((ushort)0x8080) << (ushort)shift3);
+
+ encodeQuantisedDataAsDW(lower_uchar, upper_uchar, prevChildIdx, &qbounds);
+ //encodeChildBounds(lower_uchar, upper_uchar, prevChildIdx, qbvh_node);
+ }
+
+ uint ch = prevChildIdx == 0;
+ while (ch < numChildren) {
+ uchar3 lower_uchar, upper_uchar;
+ quantise_bounds(input_aabb + ch, len, mant, org, exp, &lower_uchar, &upper_uchar);
+ encodeQuantisedDataAsDW(lower_uchar, upper_uchar, ch, &qbounds);
+ //encodeChildBounds(lower_uchar, upper_uchar, ch, qbvh_node);
+ ch += 1 + (prevChildIdx == (ch + 1));
+ }
+ Qbounds_as_DW* qbounds_dst = (Qbounds_as_DW*)(&qbvh_node->lower_x[0]);
+ *qbounds_dst = qbounds;
+ return;
+}
+
+GRL_INLINE struct AABB refitReduce2Boxes(struct AABB A, struct AABB B)
+{
+ AABB_extend(&A, &B);
+ // to make it work for TLAS node masks change to this:
+ // A.lower.w = as_float(as_uint(A.lower.w) | as_uint(B.lower.w));
+ A.lower.w = as_float(0xABBADE00u);
+ return A;
+}
+
+GRL_INLINE void refitReduceNodePrev(
+ uint prevIdx,
+ uint leadChildIdx,
+ uint numChildren,
+ struct AABB* globalBox,
+ struct AABB* reduceBox,
+ uint depth,
+ uint NodeIndex)
+{
+ uint8_t childIgnored = (prevIdx - leadChildIdx);
+
+# if REFIT_DEBUG_CHECKS
+ bool err = false;
+ if ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)
+ {
+ printf("refitReduceNode6 (loc_id %d): prev (used as child %d) not updated! NodeIndex %d, child nodeIdx %d at depth %d\n",
+ get_local_id(0),
+ childIgnored,
+ NodeIndex,
+ prevIdx,
+ depth);
+ err = true;
+ }
+
+ if ((as_uint(globalBox[NodeIndex].lower.w) & 0xFFFFFF00) == 0xABBADE00u)
+ {
+ printf("refitReduceNode6 (loc_id %d): dst node already updated. NodeIndex %d depth %d\n",
+ get_local_id(0),
+ NodeIndex,
+ depth);
+ }
+
+ bool fail = false;
+ for (uint k = 0; (k < numChildren) && !err; ++k) {
+ if (k != childIgnored) {
+ if ((as_uint(globalBox[leadChildIdx + k].lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
+ printf("refitReduceNode6 (loc_id %d): child %d not updated! use prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n",
+ get_local_id(0),
+ k,
+ prevIdx - leadChildIdx,
+ NodeIndex,
+ leadChildIdx + k,
+ depth);
+ fail = true;
+ }
+ }
+ }
+ err |= fail;
+# endif
+
+ // for each child 3 bits contains load index
+ const uint32_t indicesEncoded =
+ (1 << 0) +
+ (2 << 3) +
+ (3 << 6) +
+ (4 << 9) +
+ (5 << 12) +
+ (0 << 15) +
+ (1 << 18) +
+ (2 << 21) +
+ (3 << 24) +
+ (4 << 27);
+ // 1,2,3,4,5
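+    // Shifting the packed table right by childIgnored*3 rotates the load order so
+    // that the child equal to prevIdx is never fetched; its box already seeds
+    // *reduceBox, so it is skipped entirely.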
+
+
+ uint32_t indicesEncodedShifted = indicesEncoded >> (childIgnored * 3);
+
+ struct AABB* childAABB = globalBox + leadChildIdx;
+ struct AABB temp = childAABB[indicesEncodedShifted & 7];
+ indicesEncodedShifted >>= 3;
+ struct AABB* nextChild = childAABB + (indicesEncodedShifted & 7);
+ struct AABB backlog = temp;
+
+ for (uint child = 2; child < numChildren; child++)
+ {
+ temp = *nextChild;
+ *reduceBox = refitReduce2Boxes(*reduceBox, backlog);
+ indicesEncodedShifted >>= 3;
+ nextChild = childAABB + (indicesEncodedShifted & 7);
+ backlog = temp;
+ }
+
+ *reduceBox = refitReduce2Boxes(*reduceBox, backlog);
+
+#if REFIT_DEBUG_CHECKS
+ for (uint k = 0; (k < numChildren) && !err; ++k) {
+ if (k != childIgnored) {
+ if (!AABB_subset(&globalBox[leadChildIdx + k], reduceBox)) {
+ printf("refitReduceNode6 (loc_id %d): child AABB %d/%d reduction went wrong! skipped prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n",
+ get_local_id(0),
+ k, numChildren,
+ prevIdx - leadChildIdx,
+ NodeIndex,
+ leadChildIdx + k,
+ depth);
+
+ err = true;
+ }
+ }
+ }
+ if (!err && ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)) {
+        printf("refitReduceNode6: haven't set the 0xABBADEXXu marker in result node %d at depth %d!\n",
+ NodeIndex,
+ depth);
+ }
+#endif
+}
+
+
+GRL_INLINE uint hash_local_id()
+{
+ return get_sub_group_local_id() * get_num_sub_groups() + get_sub_group_id();
+}
+
+//===============================================================
+//
+// Core update function
+//
+//===============================================================
+GRL_INLINE bool refit_treelet_by_single_group(
+ global struct AABB* bbox,
+ local Treelet_by_single_group_locals* loc,
+ uniform global BVHBase* pBvh,
+ uniform RefitTreelet trltDsc,
+ bool encodeQnodes,
+ bool isTipTreelet)
+{
+ BackPointers* backpointers = BVHBase_GetBackPointers(pBvh);
+ InternalNode* internalNodes = BVHBase_GetInternalNodes(pBvh);
+ uint local_id = get_local_id(0);
+ StartPoint* startPoints = BVHBase_GetRefitStartPoints(pBvh) + trltDsc.startpoint_offset;
+
+    // special case for single-path treelets; TODO: rewrite it as subgroup based
+ if (trltDsc.numStartpoints == 1) {
+ if (local_id == 0) {
+ RefitTreeletTrivial desc = *((RefitTreeletTrivial*)& trltDsc);
+ uint innerNodeIdx = desc.theOnlyNodeIndex;
+ uint numChildren = desc.numChildrenOfTheNode;
+ uint childIndex = desc.childrenOffsetOfTheNode;
+ uint maxDepth = desc.maxDepth;
+
+ uint prevIdx = childIndex;
+ struct AABB myBox = bbox[childIndex];
+ struct AABB prevAABB;
+ uint backpointer = maxDepth > 0 ? *InnerNode_GetBackPointer(backpointers, innerNodeIdx) : 0;
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ uint currDepth = 0;
+
+ while (1)
+ {
+ prevAABB = myBox;
+ if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); }
+
+                if (!encodeQnodes) { myBox.upper.w = as_float(numChildren + (childIndex << 4)); }
+
+ if (++currDepth > maxDepth) { break; }
+
+ if (encodeQnodes) {
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+#if !REFIT_DEBUG_CHECKS
+ else
+#endif
+ { storeAABBToL1(myBox, &bbox[innerNodeIdx]); }
+
+ prevIdx = innerNodeIdx;
+ innerNodeIdx = BackPointer_GetParentIndex(backpointer);
+ backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+ numChildren = BackPointer_GetNumChildren(backpointer);
+ curNode = internalNodes + innerNodeIdx;
+ childIndex = innerNodeIdx + curNode->childOffset;
+ }
+
+ if (isTipTreelet) {
+ AABB3f reduced3f = AABB3fFromAABB(myBox);
+ pBvh->Meta.bounds = reduced3f;
+ }
+ else {
+ storeAABBToL3(myBox, &bbox[innerNodeIdx]);
+ }
+
+ if (encodeQnodes || isTipTreelet) {
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+
+#if REFIT_VERBOSE_LOG
+ printf("single node treelet: storing node idx %d \n", innerNodeIdx);
+#endif
+ }
+
+ return local_id == 0;
+ }
+
+ local uint* loc_startpoints = loc->startpoints;
+
+
+#if REFIT_DEBUG_CHECKS
+ if ((trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM)) {
+        if(local_id == 0) printf("out of SLM space, trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM\n");
+ return local_id == 0;
+ }
+#endif
+
+ uint SLMedStartpointsOffset = trltDsc.numStartpoints - trltDsc.numNonTrivialStartpoints;
+
+ /*=====================================================================
+       first phase: update the startpoint nodes only
+ ----------------------------------------------------------------------*/
+ for (uint startpoint_i = local_id; startpoint_i < trltDsc.numStartpoints; startpoint_i += get_local_size(0)) {
+ uint startpoint = (uint)intel_sub_group_block_read_ui((global uint*)(startPoints + startpoint_i));
+ uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
+ uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+ if (startpoint_i >= SLMedStartpointsOffset) {
+ uint idx = startpoint_i - SLMedStartpointsOffset;
+ loc_startpoints[idx] = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint);
+ }
+
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ uint childIndex = innerNodeIdx + curNode->childOffset;
+
+ uint prevIdx = childIndex;
+ struct AABB myBox = bbox[childIndex];
+ struct AABB prevAABB = myBox;
+
+# if REFIT_DEBUG_CHECKS
+ if (numChildren == 0) {
+            printf("this node has no children!\n");
+ AABB_init(&myBox);
+ }
+# endif
+
+ if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); }
+ myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
+
+#if REFIT_VERBOSE_LOG
+ printf("init phase: at depth 0 storing node idx %d \n", innerNodeIdx);
+#endif
+ storeAABBToL1(myBox, &bbox[innerNodeIdx]);
+
+ if (encodeQnodes) {
+ InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+ }
+
+ uniform uint CurrPeeledDepth = 1;
+ uniform uint numStartpoints = trltDsc.numNonTrivialStartpoints;
+ uint nextFloorStartpoint = hash_local_id();
+
+ uint depthOnionEnd = trltDsc.depthLess64;
+ if (get_local_size(0) == 128) { depthOnionEnd = trltDsc.depthLess128; }
+ if (get_local_size(0) == 256) { depthOnionEnd = trltDsc.depthLess256; }
+
+ /*=====================================================================
+       second phase: we update horizontally until the number of
+       active paths drops below the group size
+ ----------------------------------------------------------------------*/
+ while (CurrPeeledDepth < depthOnionEnd) {
+ mem_fence_workgroup_default();
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
+ uint start = nextFloorStartpoint;
+ nextFloorStartpoint = numStartpoints;
+
+ for (uint startpoint_i = start; startpoint_i < numStartpoints; startpoint_i += get_local_size(0)) {
+ uint startpoint = loc_startpoints[startpoint_i];
+ uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
+ uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+
+ if (StartPoint_GetDepth(startpoint) > CurrPeeledDepth) {
+ StartPoint newSP = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint);
+ loc_startpoints[startpoint_i] = newSP;
+ nextFloorStartpoint = min(nextFloorStartpoint, startpoint_i);
+ }
+
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ uint childIndex = innerNodeIdx + curNode->childOffset;
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ uint prevIdx = childIndex;
+ struct AABB myBox = bbox[childIndex];
+ struct AABB prevAABB = myBox;
+ refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
+
+ myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
+
+#if REFIT_VERBOSE_LOG
+ printf("onion: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_i, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx);
+#endif
+ storeAABBToL1(myBox, &bbox[innerNodeIdx]);
+ if (encodeQnodes) {
+ InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+ }
+ CurrPeeledDepth++;
+ }
+
+ uint startpoint_idx = nextFloorStartpoint;
+ bool active = startpoint_idx < numStartpoints;
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
+ StartPoint startpoint = loc_startpoints[startpoint_idx];
+
+ struct AABB myBox;
+ uint prevIdx = 0;
+ uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
+
+ /*=====================================================================
+       last phase: each thread just continues its path to the end
+
+       only the thread that computes the longest path leaves prematurely
+       (that's why the while condition isn't <=); the code for finalizing the root
+       of the treelet is special and handled afterwards
+
+       TODO: with proper assignment of paths to lanes we should quite soon reach
+       only three active lanes per physical thread, for which subgroups could be used
+ ----------------------------------------------------------------------*/
+ bool prevActive = active;
+ while (CurrPeeledDepth < trltDsc.maxDepth) {
+ uint backpointer;
+ uint childIndex;
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ if (active) {
+ childIndex = innerNodeIdx + curNode->childOffset;
+ backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+ } else if(prevActive){
+ mem_fence_workgroup_default();
+ }
+
+ prevActive = active;
+
+ work_group_barrier(0, memory_scope_work_group);
+ //printf("Start node %d at depth %d, innerNodeIdx %d dying! \n", StartPoint_GetNodeIdx(startpoint), CurrPeeledDepth, innerNodeIdx);
+ if (active) {
+
+#if REFIT_DEBUG_CHECKS
+ if (CurrPeeledDepth > StartPoint_GetDepth(startpoint))
+ {
+ printf("uppath: startpoint %d <n=%d , d=%d> at depth %d shouldn't be active!\n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth);
+ }
+#endif
+ if (prevIdx == 0) {
+ myBox = bbox[childIndex];
+ prevIdx = childIndex;
+ }
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ struct AABB prevAABB = myBox;
+ refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
+ myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
+#if REFIT_VERBOSE_LOG
+ printf("uppath: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx);
+#endif
+ active = CurrPeeledDepth < StartPoint_GetDepth(startpoint);
+
+ if (encodeQnodes) {
+#if !REFIT_DEBUG_CHECKS
+ if (!active)
+#endif
+ { storeAABBToL1(myBox, &bbox[innerNodeIdx]); }
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ } else {
+ storeAABBToL1(myBox, &bbox[innerNodeIdx]);
+ }
+
+ prevIdx = innerNodeIdx;
+ innerNodeIdx = BackPointer_GetParentIndex(backpointer);
+ }
+
+ CurrPeeledDepth++;
+ }
+
+ {
+ uint backpointer;
+ uint childIndex;
+ InternalNode* curNode = internalNodes + innerNodeIdx;
+ if (active) {
+ childIndex = innerNodeIdx + curNode->childOffset;
+ backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
+ } else if(prevActive) {
+ mem_fence_workgroup_default();
+ }
+
+ work_group_barrier(0, memory_scope_work_group);
+
+ /*=====================================================================
+           final step: special processing of the root;
+           it is different, since its box is transferred cross-group (written to L3),
+           or it is the root of the whole tree and hence fills the global box in the bvh metadata
+           TODO: this should be done in a SG as only one thread is active
+ ----------------------------------------------------------------------*/
+ if (active) {
+ if (prevIdx == 0) {
+ myBox = bbox[childIndex];
+ prevIdx = childIndex;
+ }
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+ struct AABB prevAABB = myBox;
+ refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
+ myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
+
+#if REFIT_VERBOSE_LOG
+ printf("root: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx/*,WeReInSIMD*/);
+#endif
+ if (isTipTreelet) {
+ AABB3f reduced3f = AABB3fFromAABB(myBox);
+ pBvh->Meta.bounds = reduced3f;
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ } else {
+ storeAABBToL3(myBox, &bbox[innerNodeIdx]);
+ if (encodeQnodes) {
+ InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
+ }
+ }
+ }
+ }
+
+ return active;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+//
+// Internal nodes encoding as a separate dispatch
+//
+//
+
+// encode qnodes as a separate pass
+GRL_INLINE void post_refit_encode_qnode_tree_per_group(
+ global struct AABB* bbox_scratch,
+ global struct BVHBase* bvh)
+{
+ uint numInnerNodes = BVHBase_GetNumInternalNodes(bvh);
+ InternalNode* internalNodes = BVHBase_GetInternalNodes(bvh);
+
+ for (uint nodeIdx = get_local_id(0) + 1 /*+1 because node 0 is already updated*/; nodeIdx < numInnerNodes; nodeIdx += get_local_size(0))
+ {
+ struct AABB reduced = bbox_scratch[nodeIdx];
+# if REFIT_DEBUG_CHECKS
+ if ((as_uint(reduced.lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
+ printf("qnode enc group: NodeIndex %d not updated! \n", nodeIdx);
+ return;
+ }
+ for (uint k = 0; k < (as_uint(reduced.upper.w) & 7); ++k) {
+ uint childIdx = (as_uint(reduced.upper.w) >> 4) + k;
+ if ((as_uint(bbox_scratch[childIdx].lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
+ printf("qnode enc group: child not updated! NodeIndex %d, child nodeIdx %d \n", nodeIdx, childIdx);
+ return;
+ }
+ }
+# endif
+ struct InternalNode* qbvh_node = internalNodes + nodeIdx;
+ uint childIndex = as_uint(reduced.upper.w) >> 4;
+ uint numChildren = as_uint(reduced.upper.w) & 7;
+ struct AABB* children = bbox_scratch + childIndex;
+ //InternalNode_setBounds(internalNodes + nodeIdx, bbox_scratch + (as_uint(reduced.upper.w) >> 4), as_uint(reduced.upper.w) & 7, &reduced);
+ InternalNode_setBounds_skip_prev(qbvh_node, 0, children, children, childIndex, numChildren, &reduced);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+//
+// Construction of treelets and paths
+//
+//
+
+// This is a tiny bit tricky: while the bottom-up thread hasn't yet closed the treelet, this holds the number of startpoints under the node;
+// once the thread has closed the treelet, the data becomes the treelet ID.
+typedef uint TreeletNodeData;
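+// Bit layout while a node is still open: bit 31 = treelet-root flag, bit 30 =
+// tip-startpoint flag, bits 16..29 = maxDepth and bits 0..15 = numStartpoints.
+// Once the treelet is closed, bit 31 stays set and bits 0..30 hold the treelet
+// ID instead.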
+
+typedef struct TreeletsOpenNodeInfo {
+ // bool isTreeletRoot; // : 1
+ short maxDepth; // : 14
+ uint numStartpoints;// : 16
+} TreeletsOpenNodeInfo;
+
+typedef struct TreeletsClosedNodeInfo {
+ // bool isTreeletRoot; // : 1
+ uint treeletId; // : 31 (when treelet is closed)
+} TreeletsClosedNodeInfo;
+
+GRL_INLINE TreeletNodeData ClearTreeletRoot(TreeletNodeData D)
+{
+ return D & ((1u << 31u) - 1u);
+}
+
+GRL_INLINE uint isTreeletRoot(TreeletNodeData E)
+{
+ return E >> 31;
+}
+
+GRL_INLINE uint getNumStartpoints(TreeletNodeData E)
+{
+ return E & ((1 << 16) - 1);
+}
+
+GRL_INLINE uint getMaxDepth(TreeletNodeData E)
+{
+ return (E >> 16) & ((1 << 14) - 1);
+}
+
+// single startpoint treelet
+GRL_INLINE uint isTrivialTreeletRoot(TreeletNodeData E)
+{
+ return (E >> 31) && (getMaxDepth(E) == 0);
+}
+
+GRL_INLINE TreeletNodeData SetTipStartpoint(TreeletNodeData D)
+{
+ return ClearTreeletRoot(D) | (1 << 30);
+}
+
+GRL_INLINE TreeletNodeData SetTreeletRoot(TreeletNodeData D)
+{
+ return D | (1 << 31);
+}
+
+GRL_INLINE TreeletsOpenNodeInfo DecodeOpenInfo(TreeletNodeData E)
+{
+ TreeletsOpenNodeInfo I;
+ I.maxDepth = getMaxDepth(E);
+ I.numStartpoints = getNumStartpoints(E);
+ return I;
+}
+
+GRL_INLINE TreeletNodeData EncodeOpenInfo(TreeletsOpenNodeInfo I, bool isRoot)
+{
+ TreeletNodeData D = isRoot ? (1 << 31) : 0;
+ D |= (I.maxDepth & ((1 << 14) - 1)) << 16;
+ D |= I.numStartpoints & ((1 << 16) - 1);
+ return D;
+}
+
+GRL_INLINE TreeletsClosedNodeInfo DecodeClosedInfo(TreeletNodeData E)
+{
+ TreeletsClosedNodeInfo I;
+ I.treeletId = E & ((1u << 31u) - 1u);
+ return I;
+}
+
+GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(TreeletsClosedNodeInfo I)
+{
+ TreeletNodeData D = (1u << 31u); // closed is always a root!
+ D |= I.treeletId & ((1u << 31u) - 1u);
+ return D;
+}
+
+GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(uint treeletId)
+{
+ TreeletNodeData D = (1 << 31); // closed is always a root!
+ D |= treeletId & ((1u << 31u) - 1u);
+ return D;
+}
+
+GRL_INLINE void chk_close_Treelet(
+ RefitTreelet* TreeletDescsArr,
+ TreeletNodeData* nodeTreeletDataArr,
+ uint* StartPointBuffer,
+ uint* currStartpoint,
+ TreeletNodeData nodeData,
+ TreeletsOpenNodeInfo* nodeOpenInfo,
+ uint nodeIdx,
+ uint* treeletDescIdx)
+{
+ if (isTreeletRoot(nodeData))
+ {
+ TreeletNodeData encoded = 0;
+ if (nodeOpenInfo->numStartpoints == 1)
+ {
+ encoded = ClearTreeletRoot(SetTipStartpoint(nodeData));
+ }
+ else
+ {
+ RefitTreelet RTdesc;
+ RTdesc.startpoint_offset = *currStartpoint;
+ *currStartpoint += nodeOpenInfo->numStartpoints;
+ RTdesc.numStartpoints = nodeOpenInfo->numStartpoints;
+ RTdesc.maxDepth = nodeOpenInfo->maxDepth;
+ TreeletDescsArr[*treeletDescIdx] = RTdesc;
+ encoded = EncodeClosedInfo(*treeletDescIdx);
+ *treeletDescIdx = *treeletDescIdx + 1;
+ TreeletsOpenNodeInfo infoDefault = { 0, 0 };
+ *nodeOpenInfo = infoDefault;
+ }
+
+ nodeTreeletDataArr[nodeIdx] = encoded;
+ }
+ // printf("close_Treelet %d, nodeOpenInfo.numStartpoints %d, RTdesc.maxDepth %d, RTdesc.startpoint_offset %d\n", treeletDescIdx, nodeOpenInfo.numStartpoints, RTdesc.maxDepth, RTdesc.startpoint_offset);
+}
+
+
+// TreeletNodeData* treelets holds per node property, after running this some of them are marked as treelet root
+GRL_INLINE void treelet_bottom_up_mark_treelets(
+ global struct BVHBase* bvh,
+ global InternalNode* internalNodes,
+ global StartPoint* scratch_startpoints,
+ uint curNodeIndex,
+ BackPointers* backPointers,
+ global TreeletNodeData* treelets,
+ uint refitTreeletsDataStart,
+ uint* startpointAlloc)
+{
+ TreeletsOpenNodeInfo currInfo;
+ currInfo.maxDepth = 0;
+ currInfo.numStartpoints = 1;
+
+ global RefitTreelet* treeletDescs = (global RefitTreelet*) (((global char*)bvh) + (refitTreeletsDataStart * 64));
+
+ treelets[curNodeIndex] = EncodeOpenInfo(currInfo, true);
+
+    /* the start node has already been processed, thus go to its parent node */
+ uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
+ curNodeIndex = parentPointer >> 6;
+
+ bool isInTip = false;
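+    // the walk stops at 0x03FFFFFF, the all-ones 26-bit parent index that the
+    // root's backpointer uses as its "no parent" sentinel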
+ while (curNodeIndex != 0x03FFFFFF)
+ {
+ uint numChildrenTotal = 0;
+        // numChildrenTotal and parentPointer get updated below...
+        // atomic trickery on backpointers: only the last arriving thread continues upward
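+        // backpointer format: bits 0..2 count the already-refitted children,
+        // bits 3..5 hold the total child count, bits 6 and up hold the parent index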
+ {
+ /* increment refit counter that counts refitted children of current node */
+ global uint* pCurrentBackpointer = (global uint*)InnerNode_GetBackPointer(backPointers, curNodeIndex);
+ mem_fence_gpu_invalidate();
+ parentPointer = 1 + atomic_inc_global(pCurrentBackpointer);
+
+ /* if all children got refitted, then continue */
+ const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
+ numChildrenTotal = (parentPointer >> 3) & 0x7;
+
+ if (numChildrenRefitted != numChildrenTotal)
+ return;
+
+ /* reset refit counter for next refit */
+ *pCurrentBackpointer = (parentPointer & 0xfffffff8);
+ }
+
+ /* get children treelets */
+ global struct InternalNode* node = internalNodes + curNodeIndex;
+ uint childrenIndices = curNodeIndex + node->childOffset;
+ global TreeletNodeData* childrenTreelets = treelets + childrenIndices;
+
+        // yes, we may be pulling garbage here for children that don't exist, but we won't use it;
+        // this is done for the sake of a single data pull that isn't spoiled by control flow
+ TreeletNodeData dataCh0 = childrenTreelets[0]; TreeletNodeData dataCh1 = childrenTreelets[1];
+ TreeletNodeData dataCh2 = childrenTreelets[2]; TreeletNodeData dataCh3 = childrenTreelets[3];
+ TreeletNodeData dataCh4 = childrenTreelets[4]; TreeletNodeData dataCh5 = childrenTreelets[5];
+
+ // zero out the potential trash
+ if (numChildrenTotal < 3) dataCh2 = 0;
+ if (numChildrenTotal < 4) dataCh3 = 0;
+ if (numChildrenTotal < 5) dataCh4 = 0;
+ if (numChildrenTotal < 6) dataCh5 = 0;
+
+ TreeletsOpenNodeInfo infoCh0 = DecodeOpenInfo(dataCh0);
+ TreeletsOpenNodeInfo infoCh1 = DecodeOpenInfo(dataCh1);
+ TreeletsOpenNodeInfo infoCh2 = DecodeOpenInfo(dataCh2);
+ TreeletsOpenNodeInfo infoCh3 = DecodeOpenInfo(dataCh3);
+ TreeletsOpenNodeInfo infoCh4 = DecodeOpenInfo(dataCh4);
+ TreeletsOpenNodeInfo infoCh5 = DecodeOpenInfo(dataCh5);
+
+ uint numChildrenBeingRoots = isTreeletRoot(dataCh0) + isTreeletRoot(dataCh1) + isTreeletRoot(dataCh2) + isTreeletRoot(dataCh3) + isTreeletRoot(dataCh4) + isTreeletRoot(dataCh5);
+        // see if we should merge the treelets; if not, then we should move to the tip.
+ currInfo.numStartpoints = infoCh0.numStartpoints + infoCh1.numStartpoints + infoCh2.numStartpoints + infoCh3.numStartpoints + infoCh4.numStartpoints + infoCh5.numStartpoints;
+
+ bool isTipStartpoint = false;
+ if (!isInTip)
+ {
+ // TODO: threshold could be a dynamic parameter based on the number of actual inner nodes
+ bool mergeTreelets = ((currInfo.numStartpoints > 0) && (currInfo.numStartpoints < TREELET_NUM_STARTPOINTS));
+ bool allChildrenRootsCurrently = numChildrenTotal == numChildrenBeingRoots;
+ if (mergeTreelets && allChildrenRootsCurrently)
+ {
+ childrenTreelets[0] = ClearTreeletRoot(dataCh0);
+                childrenTreelets[1] = ClearTreeletRoot(dataCh1); // clearing the root flag marks these children as no longer being treelet roots
+ if (numChildrenTotal > 2) childrenTreelets[2] = ClearTreeletRoot(dataCh2);
+ if (numChildrenTotal > 3) childrenTreelets[3] = ClearTreeletRoot(dataCh3);
+ if (numChildrenTotal > 4) childrenTreelets[4] = ClearTreeletRoot(dataCh4);
+ if (numChildrenTotal > 5) childrenTreelets[5] = ClearTreeletRoot(dataCh5);
+ }
+ else
+ {
+ isInTip = true;
+ isTipStartpoint = allChildrenRootsCurrently;
+ }
+ }
+
+ // close any roots underneath
+ if (isInTip && numChildrenBeingRoots)
+ {
+ uint trivialRoots = isTrivialTreeletRoot(dataCh0) + isTrivialTreeletRoot(dataCh1) + isTrivialTreeletRoot(dataCh2) +
+ isTrivialTreeletRoot(dataCh3) + isTrivialTreeletRoot(dataCh4) + isTrivialTreeletRoot(dataCh5);
+
+ uint treeletId = 0;
+ uint bottomStartpointSpace = 0;
+
+ uint startpointsFromTiptree = trivialRoots;
+
+ if (trivialRoots) isTipStartpoint = false;
+
+ if (numChildrenBeingRoots > trivialRoots)
+ {
+ startpointsFromTiptree += // startpoint ONLY from tiptree
+ (1 - isTreeletRoot(dataCh0)) * infoCh0.numStartpoints +
+ (1 - isTreeletRoot(dataCh1)) * infoCh1.numStartpoints +
+ (1 - isTreeletRoot(dataCh2)) * infoCh2.numStartpoints +
+ (1 - isTreeletRoot(dataCh3)) * infoCh3.numStartpoints +
+ (1 - isTreeletRoot(dataCh4)) * infoCh4.numStartpoints +
+ (1 - isTreeletRoot(dataCh5)) * infoCh5.numStartpoints;
+
+ treeletId = atomic_add_global((global uint*)BVHBase_GetRefitTreeletCntPtr(bvh), numChildrenBeingRoots - trivialRoots);
+ bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints - startpointsFromTiptree);
+ }
+
+ currInfo.numStartpoints = startpointsFromTiptree;
+
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh0, &infoCh0, childrenIndices + 0, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh1, &infoCh1, childrenIndices + 1, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh2, &infoCh2, childrenIndices + 2, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh3, &infoCh3, childrenIndices + 3, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh4, &infoCh4, childrenIndices + 4, &treeletId);
+ chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh5, &infoCh5, childrenIndices + 5, &treeletId);
+ }
+
+ if (isTipStartpoint)
+ {
+ currInfo.maxDepth = 0;
+ currInfo.numStartpoints = 1;
+ }
+ else
+ {
+            // reduce max depth and number of startpoints underneath
+ currInfo.maxDepth = max(max(max(infoCh0.maxDepth, infoCh1.maxDepth),
+ max(infoCh2.maxDepth, infoCh3.maxDepth)),
+ max(infoCh4.maxDepth, infoCh5.maxDepth)) + 1;
+ }
+
+ treelets[curNodeIndex] = EncodeOpenInfo(
+ currInfo,
+            !isInTip /* mark the merged treelet as a new root iff we are still in the bottom part */);
+
+ /* make parent node the current node */
+ curNodeIndex = parentPointer >> 6;
+ }
+
+ uint treeletId = *BVHBase_GetRefitTreeletCntPtr(bvh);
+
+ uint bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints);
+
+ treelets[0] = EncodeClosedInfo(treeletId);
+ RefitTreelet tipTreeletDesc;
+ tipTreeletDesc.startpoint_offset = bottomStartpointSpace;
+ tipTreeletDesc.numStartpoints = currInfo.numStartpoints;
+ tipTreeletDesc.maxDepth = currInfo.maxDepth;
+
+ treeletDescs[treeletId] = tipTreeletDesc;
+
+ uint realNumberOfTreelets = treeletId + 1;
+    // intentionally we set this one less, because this number is used as the group count for the dispatch,
+    // which is the number of bottom treelets, so subtract 1; except for a single-treelet tree, where it should stay 1.
+ uint numStartingTreelets = (treeletId == 0) ? 1 : treeletId;
+
+ *BVHBase_GetRefitTreeletCntPtr(bvh) = numStartingTreelets;
+
+ uint treeletDescSpaceIn64B = (realNumberOfTreelets * sizeof(RefitTreelet) + 63) >> 6;
+ uint startpointSpaceIn64B = ((bottomStartpointSpace + currInfo.numStartpoints) * sizeof(StartPoint) + 63) >> 6;
+ bvh->refitStartPointDataStart = refitTreeletsDataStart + treeletDescSpaceIn64B;
+    bvh->BVHDataEnd = refitTreeletsDataStart + treeletDescSpaceIn64B + startpointSpaceIn64B;
+ *startpointAlloc = 0;
+}
+
+
+GRL_INLINE void find_refit_treelets(
+ global struct BVHBase* bvh,
+ global TreeletNodeData* treelets,
+ global uint* scratchStartpoints,
+ global uint* startpointAlloc)
+{
+ /* get pointer to inner nodes and back pointers */
+ uniform global InternalNode* inner_nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh);
+
+ /* construct range of nodes that each work group will process */
+ uniform const uint numInnerNodes = BVHBase_numNodes(bvh);
+
+ varying ushort lane = get_sub_group_local_id();
+ varying uint global_id = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ uint numBackpointers = BVHBase_GetNumInternalNodes(bvh);
+
+ // align to 64B and divide
+ uint treeletOffsetIn64B = ((numBackpointers * sizeof(uint)) + 63) >> 6;
+
+ uint refitTreeletsDataStart = bvh->backPointerDataStart + treeletOffsetIn64B;
+ if (global_id == 0)
+ {
+ bvh->refitTreeletsDataStart = refitTreeletsDataStart;
+ }
+
+ global struct InternalNode* curNode = &inner_nodes[global_id];
+
+ varying ushort has_startpoint = 0;
+ if (global_id < numInnerNodes) {
+ if ((curNode->nodeType != BVH_INTERNAL_NODE))
+ {
+ has_startpoint = 1;
+ }
+ }
+
+ if (has_startpoint == 0)
+ return;
+
+ treelet_bottom_up_mark_treelets(
+ bvh,
+ inner_nodes,
+ scratchStartpoints,
+ global_id,
+ BVHBase_GetBackPointers(bvh),
+ treelets,
+ refitTreeletsDataStart,
+ startpointAlloc);
+}
+
+GRL_INLINE void assign_refit_startpoints_to_treelets(
+ global struct BVHBase* bvh,
+ global TreeletNodeData* treelets,
+ global uint* scratchStartpoints)
+{
+ /* get pointer to inner nodes and back pointers */
+ uniform global struct InternalNode* inner_nodes = (global struct InternalNode*) BVHBase_GetInternalNodes(bvh);
+
+ /* construct range of nodes that each work group will process */
+ uniform const uint numInnerNodes = BVHBase_numNodes(bvh);
+
+ varying ushort lane = get_sub_group_local_id();
+ varying uint starPointNode = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ varying uint curNodeIndex = starPointNode;
+ global struct InternalNode* curNode = &inner_nodes[curNodeIndex];
+
+ varying ushort is_startpoint = 0;
+
+ if (curNodeIndex < numInnerNodes)
+ {
+ if ((curNode->nodeType != BVH_INTERNAL_NODE))
+ {
+ is_startpoint = 1;
+ }
+ }
+
+ if (is_startpoint == 0)
+ {
+ return;
+ }
+
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh);
+ uint numTreelets = *BVHBase_GetRefitTreeletCntPtr(bvh);
+ if (numTreelets > 1) numTreelets++;
+
+ uint myDepthWhenDead = 0;
+ uint startpointsBeforeMe = 0;
+ bool dead = false;
+
+ uint prevNodeIndex = 0x03FFFFFF;
+
+ while (curNodeIndex != 0x03FFFFFF)
+ {
+ TreeletNodeData nodeData = treelets[curNodeIndex];
+
+ uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
+ uint numChildren = BackPointer_GetNumChildren(parentPointer);
+
+        // this is the counterpart of the atomic-based entrance decision:
+        // the path that stays alive is the longest one; if two are equal, take the one that came through the child with the smaller index.
+ if (prevNodeIndex != 0x03FFFFFF)
+ {
+ uint leadChildOfCur = curNodeIndex + inner_nodes[curNodeIndex].childOffset;
+ uint childEnd = numChildren + leadChildOfCur;
+
+ uint longestPath = 0;
+ uint longestPathChildIdx = leadChildOfCur;
+
+ for (uint child = leadChildOfCur; child < childEnd; child++)
+ {
+ TreeletNodeData childData = treelets[child];
+ if (!isTreeletRoot(childData))
+ {
+ TreeletsOpenNodeInfo childinfo = DecodeOpenInfo(childData);
+ if (longestPath <= childinfo.maxDepth) {
+ longestPathChildIdx = child;
+ longestPath = childinfo.maxDepth + 1;
+ }
+
+ if (child < prevNodeIndex)
+ {
+ // also count how many startpoints come before me (used to place this startpoint in the proper slot)
+ startpointsBeforeMe += childinfo.numStartpoints;
+ }
+ }
+ }
+
+ if (!dead && prevNodeIndex != longestPathChildIdx)
+ {
+ dead = true;
+ //printf("starPointNode %d dies in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
+ }
+
+ if (!dead) // this "if" is not an "else" to the above, as we might already be dead while coming through the same child index
+ {
+ myDepthWhenDead = longestPath;
+ // it is a startpoint
+ //printf("starPointNode %d in node %d lives up, its myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
+ }
+
+ if (starPointNode == (uint)-1) {
+ // we just entered the upper treelet; if we are still alive, we can become a new startpoint in that treelet
+ if (dead)
+ {
+ //printf("starPointNode %d disappears in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
+ // and we are dead, so we are not a startpoint of the tip treelet
+ // and must disappear so that we are not added as a startpoint.
+ return;
+ }
+ else
+ {
+ // it is a startpoint
+ //printf("starPointNode %d in node %d becoming its new startpoint\n", starPointNode, curNodeIndex);
+ starPointNode = curNodeIndex;
+ }
+ }
+ }
+
+ if (isTreeletRoot(nodeData))
+ {
+ TreeletsClosedNodeInfo info = DecodeClosedInfo(nodeData);
+ RefitTreelet treeletDesc = treeletDescs[info.treeletId];
+ uint startpointSlot = treeletDesc.startpoint_offset + startpointsBeforeMe;
+ scratchStartpoints[startpointSlot] = (starPointNode << 6) + (myDepthWhenDead & ((1 << 6) - 1));
+
+ //printf("Adding to treeletID %d at root %d startpoint %d StartNodeIdx %d, depth %d\n", info.treeletId, curNodeIndex, startpointSlot, starPointNode, myDepthWhenDead);
+
+ if (dead) return;
+ myDepthWhenDead = 0;
+ startpointsBeforeMe = 0;
+ starPointNode = (uint)-1;
+ }
+
+ /* make parent node the current node */
+ prevNodeIndex = curNodeIndex;
+ curNodeIndex = BackPointer_GetParentIndex(parentPointer);
+ //if(!dead)
+ //printf("starPointNode %d move from node %d to %d\n", starPointNode, prevNodeIndex, curNodeIndex);
+ }
+}
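
The startpoint written above packs the node index into the upper bits and the path depth into the low 6 bits, i.e. `(starPointNode << 6) + (myDepthWhenDead & 0x3f)`. A sketch of that assumed packing; the real StartPoint helpers in the GRL headers may differ in width or field layout:

```c
#include <stdint.h>

typedef uint32_t StartPoint;   /* assumption: a 32-bit packed value */

/* Pack node index and 6-bit depth; OR is equivalent to the "+" used above
 * because the depth is masked to the low 6 bits first. */
static inline StartPoint startpoint_pack(uint32_t node_idx, uint32_t depth)
{
    return (node_idx << 6) | (depth & 0x3f);
}

static inline uint32_t startpoint_node(StartPoint s)  { return s >> 6; }
static inline uint32_t startpoint_depth(StartPoint s) { return s & 0x3f; }
```
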
+
+const uint FINALIZE_TREELETS_SLM_DEPTHS_SPACE = 32;
+
+GRL_INLINE void finalize_treelets_in_groups(
+ global struct BVHBase* bvh,
+ global uint* scratchStartpoints,
+ local uint* depths)
+{
+ uint numTreeletsExecuted = *BVHBase_GetRefitTreeletCntPtr(bvh);
+
+ uint local_id = get_local_id(0);
+
+ uint numTreelets = (numTreeletsExecuted > 1) ? numTreeletsExecuted + 1 : numTreeletsExecuted;
+
+ RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh);
+
+ for (uint treeletId = get_group_id(0); treeletId < numTreelets; treeletId += numTreeletsExecuted)
+ {
+ if (treeletId == numTreeletsExecuted && treeletId != 0) { work_group_barrier(CLK_LOCAL_MEM_FENCE); }
+
+ RefitTreelet treeletDesc = treeletDescs[treeletId];
+ StartPoint* srcStartpoints = scratchStartpoints + treeletDesc.startpoint_offset;
+ if (treeletDesc.numStartpoints <= 1)
+ {
+ // for lower latency we store single-startpoint treelets as RefitTreeletTrivial;
+ // this is usually the case for the tip treelet
+ if (local_id == 0)
+ {
+ RefitTreeletTrivial tr = { 0, treeletDesc.numStartpoints, 0, treeletDesc.maxDepth, 0 };
+ if (treeletDesc.numStartpoints == 1)
+ {
+ StartPoint sp = srcStartpoints[0];
+
+ tr.theOnlyNodeIndex = StartPoint_GetNodeIdx(sp);
+ uint backpointer = *InnerNode_GetBackPointer(BVHBase_GetBackPointers(bvh), tr.theOnlyNodeIndex);
+ tr.numChildrenOfTheNode = BackPointer_GetNumChildren(backpointer);
+ tr.childrenOffsetOfTheNode = BVHBase_GetInternalNodes(bvh)[tr.theOnlyNodeIndex].childOffset + tr.theOnlyNodeIndex;
+ }
+ RefitTreeletTrivial* trivial = (RefitTreeletTrivial*)(treeletDescs + treeletId);
+ *trivial = tr;
+#if REFIT_VERBOSE_LOG
+ printf("treelet trivial %d {\n theOnlyNodeIndex = %d;\n numStartpoints = %d;\n childrenOffsetOfTheNode = %d;\n maxDepth =%d;\n numChildrenOfTheNode = %d;\n}\n",
+ treeletId,
+ tr.theOnlyNodeIndex,
+ tr.numStartpoints,
+ tr.childrenOffsetOfTheNode,
+ tr.maxDepth,
+ tr.numChildrenOfTheNode);
+#endif
+ }
+ }
+ else
+ {
+#define SKIP_PATHS_SORTING 0
+#if SKIP_PATHS_SORTING
+ StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset;
+ for (uint startpointID = local_id; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0))
+ {
+ dstStartpoints[startpointID] = srcStartpoints[startpointID];
+ }
+#else
+ //if (local_id == 0) { printf("treelet %d, numStartpoints = %d\n", treeletId, numStartpoints); }
+
+ if (local_id <= treeletDesc.maxDepth) {
+ depths[local_id] = 0;
+ // printf("initializing slm treelet %d, depths[%d] = 0\n", treeletId, local_id);
+ }
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint loopSize = ((treeletDesc.numStartpoints + (get_sub_group_size() - 1)) / get_sub_group_size()) * get_sub_group_size();
+
+ // collect histogram of how many paths of given length we have
+
+ // keep count of depth 0
+ uint val = 0;
+
+ // optimization: load each StartPoint from memory only once and cache it in registers (S_c)
+ uint S_c[8];
+ // optimize: keep accumulated numbers in registers to limit number of atomic ops
+ uint D_c[8] = { 0 };
+
+ uint cached_threshold = 8 * get_local_size(0);
+ cached_threshold = min(cached_threshold, treeletDesc.numStartpoints);
+
+ uint loop_turn = 0;
+ uint sgid = get_sub_group_local_id();
+
+ for (uint startpointID = local_id + cached_threshold; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0))
+ {
+ uint dstSlot = StartPoint_GetDepth(srcStartpoints[startpointID]);
+ atomic_inc((volatile local uint*) (depths + dstSlot));
+ }
+
+ uint HistogramSG = 0;
+ if (treeletDesc.maxDepth < 8)
+ {
+ for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
+ {
+ StartPoint S = srcStartpoints[startpointID];
+ S_c[loop_turn++] = S;
+ uint dstSlot = StartPoint_GetDepth(S);
+ D_c[dstSlot]++;
+ }
+
+ for (uint d = 0; d <= treeletDesc.maxDepth; d++)
+ {
+ val = sub_group_reduce_add(D_c[d]);
+ if (sgid == d)
+ {
+ HistogramSG = val;
+ }
+ }
+ if (sgid <= treeletDesc.maxDepth && HistogramSG != 0)
+ {
+ atomic_add((volatile local uint*) (depths + sgid), HistogramSG);
+ }
+ }
+ else
+ {
+ for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
+ {
+ StartPoint S = srcStartpoints[startpointID];
+ S_c[loop_turn++] = S;
+ uint dstSlot = StartPoint_GetDepth(S);
+ atomic_inc((volatile local uint*) (depths + dstSlot));
+ }
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+#if REFIT_VERBOSE_LOG
+ if (local_id == 0)
+ {
+ for (uint d = 0; d <= treeletDesc.maxDepth; d++)
+ {
+ printf("treelet %d depths[%d] = %d\n", treeletId, d, depths[d]);
+ }
+ }
+#endif
+
+ if (treeletDesc.maxDepth < get_sub_group_size())
+ {
+ if (get_sub_group_id() == 0)
+ {
+
+ uint cntOfDepth = 0;
+ if (sgid <= treeletDesc.maxDepth) {
+ cntOfDepth = depths[sgid];
+ }
+ uint pref_sum = sub_group_scan_exclusive_add(cntOfDepth);
+ depths[sgid] = pref_sum;
+
+ uint numLeft = treeletDesc.numStartpoints - (pref_sum);
+ uint depthLess64 = (numLeft < 64 ) ? (uint)sgid : (uint)treeletDesc.maxDepth;
+ uint depthLess128 = (numLeft < 128) ? (uint)sgid : (uint)treeletDesc.maxDepth;
+ uint depthLess256 = (numLeft < 256) ? (uint)sgid : (uint)treeletDesc.maxDepth;
+
+ // fill in the data for lane 0, which will store it to memory
+ treeletDesc.depthLess64 = sub_group_reduce_min(depthLess64);
+ treeletDesc.depthLess128 = sub_group_reduce_min(depthLess128);
+ treeletDesc.depthLess256 = sub_group_reduce_min(depthLess256);
+ treeletDesc.numNonTrivialStartpoints = treeletDesc.numStartpoints - cntOfDepth;
+
+ if (sgid == 0) {
+ treeletDescs[treeletId] = treeletDesc;
+#if REFIT_VERBOSE_LOG
+ printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; \n maxDepth = %d;\n depthLess64 = %d;\n depthLess128 = %d;\n depthLess256 = %d;\n}\n",
+ treeletId,
+ treeletDesc.startpoint_offset,
+ treeletDesc.numStartpoints,
+ treeletDesc.numNonTrivialStartpoints,
+ treeletDesc.maxDepth,
+ treeletDesc.depthLess64,
+ treeletDesc.depthLess128,
+ treeletDesc.depthLess256);
+#endif
+ }
+ }
+ }
+ else if (local_id <= treeletDesc.maxDepth) {
+ uint thisdepthcount = depths[local_id];
+ treeletDesc.depthLess64 = 0;
+ treeletDesc.depthLess128 = 0;
+ treeletDesc.depthLess256 = 0;
+ uint numLeft = treeletDesc.numStartpoints;
+ uint pref_sum = 0;
+
+ for (uint d = 0; d < local_id; d++)
+ {
+ uint depthCnt = depths[d];
+ if (numLeft > 64) { treeletDesc.depthLess64 = d + 1; }
+ if (numLeft > 128) { treeletDesc.depthLess128 = d + 1; }
+ if (numLeft > 256) { treeletDesc.depthLess256 = d + 1; }
+ pref_sum += depthCnt;
+ numLeft -= depthCnt;
+ if (d == 0) { treeletDesc.numNonTrivialStartpoints = numLeft; }
+ }
+
+ if (local_id == treeletDesc.maxDepth)
+ {
+ treeletDescs[treeletId] = treeletDesc;
+#if REFIT_VERBOSE_LOG
+ printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; maxDepth = %d;\n depthLess64 = %d; depthLess128 = %d; depthLess256 = %d;\n}\n",
+ treeletId,
+ treeletDesc.startpoint_offset,
+ treeletDesc.numStartpoints,
+ treeletDesc.numNonTrivialStartpoints,
+ treeletDesc.maxDepth,
+ treeletDesc.depthLess64,
+ treeletDesc.depthLess128,
+ treeletDesc.depthLess256);
+#endif
+ }
+ }
+
+ StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset;
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ loop_turn = 0;
+ if (treeletDesc.maxDepth < 8)
+ {
+ uint prefixSG = 0;
+
+ // make prefixSG hold the start of the output interval reserved for this subgroup's paths of depth == sgid
+ if (sgid <= treeletDesc.maxDepth && HistogramSG != 0)
+ {
+ prefixSG = atomic_add((volatile local uint*) (depths + sgid), HistogramSG);
+ }
+
+ // from now on all sgs run independently
+
+ // make D_c[d] hold this lane's output offset within the interval reserved for the subgroup
+ for (uint d = 0; d <= treeletDesc.maxDepth; d++)
+ {
+ uint thisDPrefixSg = sub_group_broadcast(prefixSG, d);
+ uint thisLaneCount = D_c[d];
+ uint laneOffset = sub_group_scan_exclusive_add(thisLaneCount);
+ D_c[d] = laneOffset + thisDPrefixSg;
+ }
+
+ for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
+ {
+ StartPoint S = S_c[loop_turn++];
+ uint d = StartPoint_GetDepth(S);
+ uint dstSlot = D_c[d]++;
+ dstStartpoints[dstSlot] = S;
+ }
+ }
+ else
+ {
+ for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
+ {
+ StartPoint S = S_c[loop_turn++];
+ uint d = StartPoint_GetDepth(S);
+ uint dstSlot = atomic_inc((volatile local uint*) (depths + d));
+ dstStartpoints[dstSlot] = S;
+ }
+ }
+
+ for (uint srcStartpointID = local_id + cached_threshold; srcStartpointID < treeletDesc.numStartpoints; srcStartpointID += get_local_size(0))
+ {
+ StartPoint S = srcStartpoints[srcStartpointID];
+ uint d = StartPoint_GetDepth(S);
+ uint dstSlot = atomic_inc((volatile local uint*) (depths + d));
+ dstStartpoints[dstSlot] = S;
+ }
+#endif //skip sorting
+ }
+ }
+}
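
finalize_treelets_in_groups orders each treelet's startpoints by path depth using a shared-local histogram followed by an exclusive prefix sum, i.e. a counting sort keyed on the 6-bit depth. A single-threaded reference of the same ordering (no SLM, subgroups or atomics), under the same packed-StartPoint assumption as the sketch above:

```c
#include <stdint.h>
#include <string.h>

#define MAX_DEPTH 64   /* the depth field is 6 bits */

/* Counting sort of packed startpoints by depth; dst must hold n elements. */
static void sort_startpoints_by_depth(const uint32_t *src, uint32_t *dst, uint32_t n)
{
    uint32_t hist[MAX_DEPTH];
    memset(hist, 0, sizeof(hist));

    for (uint32_t i = 0; i < n; i++)             /* histogram of depths */
        hist[src[i] & 0x3f]++;

    uint32_t sum = 0;                            /* exclusive prefix sum */
    for (uint32_t d = 0; d < MAX_DEPTH; d++) {
        uint32_t c = hist[d];
        hist[d] = sum;
        sum += c;
    }

    for (uint32_t i = 0; i < n; i++)             /* scatter into depth order */
        dst[hist[src[i] & 0x3f]++] = src[i];
}
```
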
diff --git a/src/intel/vulkan/grl/gpu/bvh_copy.cl b/src/intel/vulkan/grl/gpu/bvh_copy.cl
new file mode 100644
index 00000000000..6e76f195095
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_copy.cl
@@ -0,0 +1,763 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "d3d12.h"
+#include "common.h"
+#include "mem_utils.h"
+#include "misc_shared.h"
+
+#define offsetof(TYPE, ELEMENT) ((size_t)&(((TYPE *)0)->ELEMENT))
+
+GRL_INLINE
+uint GroupCountForCopySize(uint size)
+{
+ return (size >> 8) + 4;
+}
+
+GRL_INLINE
+uint GroupCountForCopy(BVHBase* base)
+{
+ return GroupCountForCopySize(base->Meta.allocationSize);
+}
+
+GRL_INLINE void copyInstanceDescs(InstanceDesc* instances, D3D12_RAYTRACING_INSTANCE_DESC* descs, uint64_t numInstances)
+{
+ for (uint64_t instanceIndex = get_local_id(0); instanceIndex < numInstances; instanceIndex += get_local_size(0))
+ {
+ for (uint row = 0; row < 3; row++)
+ {
+ for (uint column = 0; column < 4; column++)
+ {
+ D3D12_set_transform(&descs[instanceIndex], row, column, InstanceDesc_get_transform(&instances[instanceIndex], row, column));
+ }
+ }
+ D3D12_set_instanceID(&descs[instanceIndex], InstanceDesc_get_instanceID(&instances[instanceIndex]));
+ D3D12_set_InstanceMask(&descs[instanceIndex], InstanceDesc_get_InstanceMask(&instances[instanceIndex]));
+ D3D12_set_InstanceContributionToHitGroupIndex(&descs[instanceIndex], InstanceDesc_get_InstanceContributionToHitGroupIndex(&instances[instanceIndex]));
+ D3D12_set_InstanceFlags(&descs[instanceIndex], InstanceDesc_get_InstanceFlags(&instances[instanceIndex]));
+ D3D12_set_AccelerationStructure(&descs[instanceIndex], InstanceDesc_get_AccelerationStructure(&instances[instanceIndex]));
+ }
+}
+
+GRL_INLINE void createGeoDescs(GeoMetaData* geoMetaData, D3D12_RAYTRACING_GEOMETRY_DESC* descs, uint64_t numGeos, const uint64_t dataBufferStart)
+{
+ if (get_local_id(0) == 0)
+ {
+ uint64_t previousGeoDataBufferEnd = dataBufferStart;
+ for (uint64_t geoIndex = 0; geoIndex < numGeos; geoIndex += 1)
+ {
+ D3D12_set_Type(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Type));
+ D3D12_set_Flags(&descs[geoIndex], (uint8_t)(0xffff & geoMetaData[geoIndex].Flags));
+ if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
+ {
+ // Every triangle is stored separately
+ uint64_t vertexBufferSize = 9 * sizeof(float) * geoMetaData[geoIndex].PrimitiveCount;
+ D3D12_set_triangles_Transform(&descs[geoIndex], 0);
+ D3D12_set_triangles_IndexFormat(&descs[geoIndex], INDEX_FORMAT_NONE);
+ D3D12_set_triangles_VertexFormat(&descs[geoIndex], VERTEX_FORMAT_R32G32B32_FLOAT);
+ D3D12_set_triangles_IndexCount(&descs[geoIndex], 0);
+ D3D12_set_triangles_VertexCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount * 3);
+ D3D12_set_triangles_IndexBuffer(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
+ D3D12_set_triangles_VertexBuffer_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
+ D3D12_set_triangles_VertexBuffer_StrideInBytes(&descs[geoIndex], 3 * sizeof(float));
+ previousGeoDataBufferEnd += vertexBufferSize;
+ }
+ else
+ {
+ D3D12_set_procedurals_AABBCount(&descs[geoIndex], geoMetaData[geoIndex].PrimitiveCount);
+ D3D12_set_procedurals_AABBs_StartAddress(&descs[geoIndex], (D3D12_GPU_VIRTUAL_ADDRESS)previousGeoDataBufferEnd);
+ D3D12_set_procedurals_AABBs_StrideInBytes(&descs[geoIndex], sizeof(D3D12_RAYTRACING_AABB));
+ previousGeoDataBufferEnd += sizeof(D3D12_RAYTRACING_AABB) * geoMetaData[geoIndex].PrimitiveCount;
+ }
+ }
+ }
+}
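
createGeoDescs lays the decoded data for all geometries out back to back starting at dataBufferStart: 9 floats per triangle, one AABB per procedural primitive. A host-side sketch of that running-offset layout; GeoInfo is a simplified stand-in for the real GeoMetaData, and the AABB size is passed in rather than taken from d3d12.h:

```c
#include <stdint.h>

/* Simplified per-geometry metadata for this sketch only. */
typedef struct { uint32_t type; uint32_t prim_count; } GeoInfo;

enum { GEO_TRIANGLES = 0, GEO_PROCEDURAL = 1 };

/* Compute where each geometry's decoded vertex/AABB data begins, mirroring
 * the sequential layout built above. aabb_size would be
 * sizeof(D3D12_RAYTRACING_AABB) (24 bytes) in the real code. */
static void layout_geo_data(const GeoInfo *geos, uint32_t n,
                            uint64_t data_start, uint64_t aabb_size,
                            uint64_t *out_offsets)
{
    uint64_t cursor = data_start;
    for (uint32_t i = 0; i < n; i++) {
        out_offsets[i] = cursor;
        if (geos[i].type == GEO_TRIANGLES)
            cursor += 9ull * sizeof(float) * geos[i].prim_count;  /* 3 verts * 3 floats */
        else
            cursor += aabb_size * geos[i].prim_count;
    }
}
```
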
+
+GRL_INLINE void copyIndiciesAndVerticies(D3D12_RAYTRACING_GEOMETRY_DESC* desc, QuadLeaf* quad)
+{
+ float* vertices = (float*)D3D12_get_triangles_VertexBuffer_StartAddress(desc);
+ uint64_t firstTriangleIndex = quad->primIndex0;
+ uint64_t numTriangles = QuadLeaf_IsSingleTriangle(quad) ? 1 : 2;
+
+ vertices[firstTriangleIndex * 9] = quad->v[0][0];
+ vertices[firstTriangleIndex * 9 + 1] = quad->v[0][1];
+ vertices[firstTriangleIndex * 9 + 2] = quad->v[0][2];
+
+ vertices[firstTriangleIndex * 9 + 3] = quad->v[1][0];
+ vertices[firstTriangleIndex * 9 + 4] = quad->v[1][1];
+ vertices[firstTriangleIndex * 9 + 5] = quad->v[1][2];
+
+ vertices[firstTriangleIndex * 9 + 6] = quad->v[2][0];
+ vertices[firstTriangleIndex * 9 + 7] = quad->v[2][1];
+ vertices[firstTriangleIndex * 9 + 8] = quad->v[2][2];
+
+ if (numTriangles == 2)
+ {
+ uint64_t secondTriangleIndex = firstTriangleIndex + QuadLeaf_GetPrimIndexDelta(quad);
+ uint32_t packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
+ for( size_t i=0; i<3; i++ )
+ {
+ uint32_t idx = packed_indices & 3 ; packed_indices >>= 2;
+ for( size_t j=0; j<3; j++ )
+ vertices[secondTriangleIndex * 9 + i * 3 + j] = quad->v[idx][j];
+ }
+ }
+}
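
A quad leaf stores the first triangle's three vertices explicitly; the second triangle reuses the quad's four vertices through three 2-bit selectors packed into one word (QuadLeaf_GetSecondTriangleIndices above). A sketch of that unpacking, assuming exactly the packing consumed by the loop above:

```c
#include <stdint.h>

/* Expand a packed word of 2-bit vertex selectors into the second
 * triangle's vertices, copying from the quad's 4-entry vertex array. */
static void expand_second_triangle(const float quad_verts[4][3],
                                   uint32_t packed_indices,
                                   float out_tri[3][3])
{
    for (int i = 0; i < 3; i++) {
        uint32_t idx = packed_indices & 3;   /* which of the 4 quad vertices */
        packed_indices >>= 2;
        for (int j = 0; j < 3; j++)
            out_tri[i][j] = quad_verts[idx][j];
    }
}
```
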
+
+GRL_INLINE
+void storeProceduralDesc(
+ struct AABB procAABB,
+ uint32_t primId,
+ D3D12_RAYTRACING_GEOMETRY_DESC* geoDesc)
+{
+ D3D12_RAYTRACING_AABB* proceduralDescs = (D3D12_RAYTRACING_AABB*)D3D12_get_procedurals_AABBs_StartAddress(geoDesc);
+ D3D12_set_raytracing_aabb(&proceduralDescs[primId], &procAABB);
+}
+
+GRL_INLINE
+void copyDataFromLProcedurals(
+ BVHBase* base,
+ D3D12_RAYTRACING_GEOMETRY_DESC* descs)
+{
+ unsigned numProcedurals = BVHBase_GetNumProcedurals(base);
+ InternalNode* innerNodes = BVHBase_GetInternalNodes(base);
+ unsigned numInnerNodes = BVHBase_GetNumInternalNodes(base);
+
+ if (BVHBase_GetNumProcedurals(base) > 0) //< there's no point entering here if there are no procedurals
+ {
+
+ // iterate over all inner nodes to identify those with procedural children; we have to take the AABBs from them
+ for (uint32_t nodeI = get_local_id(0); nodeI < numInnerNodes; nodeI += get_local_size(0))
+ {
+ InternalNode* innerNode = innerNodes + nodeI;
+
+ if (innerNode->nodeType == NODE_TYPE_PROCEDURAL)
+ {
+ float* origin = innerNode->lower;
+
+ global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer((struct QBVHNodeN*)innerNode);
+
+ for (uint k = 0; k < 6; k++)
+ {
+ if (InternalNode_IsChildValid(innerNode, k))
+ {
+ struct AABB3f qbounds = {
+ (float)(innerNode->lower_x[k]), (float)(innerNode->lower_y[k]), (float)(innerNode->lower_z[k]),
+ (float)(innerNode->upper_x[k]), (float)(innerNode->upper_y[k]), (float)(innerNode->upper_z[k]) };
+
+ struct AABB dequantizedAABB;
+
+ dequantizedAABB.lower[0] = origin[0] + bitShiftLdexp(qbounds.lower[0], innerNode->exp_x - 8);
+ dequantizedAABB.lower[1] = origin[1] + bitShiftLdexp(qbounds.lower[1], innerNode->exp_y - 8);
+ dequantizedAABB.lower[2] = origin[2] + bitShiftLdexp(qbounds.lower[2], innerNode->exp_z - 8);
+ dequantizedAABB.upper[0] = origin[0] + bitShiftLdexp(qbounds.upper[0], innerNode->exp_x - 8);
+ dequantizedAABB.upper[1] = origin[1] + bitShiftLdexp(qbounds.upper[1], innerNode->exp_y - 8);
+ dequantizedAABB.upper[2] = origin[2] + bitShiftLdexp(qbounds.upper[2], innerNode->exp_z - 8);
+
+ dequantizedAABB = conservativeAABB(&dequantizedAABB);
+ /* extract geomID and primID from leaf */
+ const uint startPrim = QBVHNodeN_startPrim((struct QBVHNodeN*) innerNode, k);
+ const uint geomID = ProceduralLeaf_geomIndex(leaf);
+ const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
+
+ storeProceduralDesc(dequantizedAABB, primID, descs + geomID);
+ }
+ /* advance leaf pointer to next child */
+ leaf += QBVHNodeN_blockIncr((struct QBVHNodeN*)innerNode, k);
+ }
+
+ }
+ else if (innerNode->nodeType == NODE_TYPE_MIXED) { ERROR(); }
+ else {/* do nothing for other internal node types, they can't have procedural child (directly)*/; }
+ }
+ }
+}
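
Child bounds in an internal node are quantized to 8 bits per axis relative to the node origin, with a per-axis power-of-two scale, and bitShiftLdexp(q, exp - 8) rescales them as q * 2^(exp - 8). A standalone sketch of the same dequantization for one axis, using ldexpf instead of the bit-shift variant the GRL code uses:

```c
#include <math.h>

/* Dequantize one 8-bit child bound along a single axis.
 * origin: node anchor for that axis, exponent: the per-axis exponent byte. */
static inline float dequantize_bound(float origin, unsigned char q, int exponent)
{
    return origin + ldexpf((float)q, exponent - 8);   /* origin + q * 2^(exp-8) */
}
```
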
+
+GRL_INLINE
+void copyDataFromQuadLeaves(BVHBase* base,
+ D3D12_RAYTRACING_GEOMETRY_DESC* descs)
+{
+ QuadLeaf* quads = BVHBase_GetQuadLeaves(base);
+ uint64_t numQuads = BVHBase_GetNumQuads(base);
+ for (uint64_t quadIdx = get_local_id(0); quadIdx < numQuads; quadIdx += get_local_size(0))
+ {
+ uint64_t descIdx = PrimLeaf_GetGeoIndex(&quads[quadIdx].leafDesc);
+ copyIndiciesAndVerticies(&descs[descIdx], &quads[quadIdx]);
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel clone_indirect(global char* dest,
+ global char* src)
+{
+ BVHBase* base = (BVHBase*)src;
+ uint64_t bvhSize = base->Meta.allocationSize;
+
+ uint numGroups = GroupCountForCopy(base);
+ CopyMemory(dest, src, bvhSize, numGroups);
+}
+
+GRL_INLINE void compactT(global char* dest, global char* src, uint64_t compactedSize, uint skipCopy, uint groupCnt)
+{
+ global BVHBase* baseSrc = (global BVHBase*)src;
+ global BVHBase* baseDest = (global BVHBase*)dest;
+
+ uint32_t offset = sizeof(BVHBase);
+ uint32_t numNodes = BVHBase_GetNumInternalNodes(baseSrc);
+ uint32_t nodeSize = numNodes * sizeof(InternalNode);
+ offset += nodeSize;
+
+ int quadChildFix = baseSrc->quadLeafStart;
+ int procChildFix = baseSrc->proceduralDataStart;
+ int instChildFix = baseSrc->instanceLeafStart;
+
+ // serialization already copies part of bvh base so skip this part
+ CopyMemory(dest + skipCopy, src + skipCopy, sizeof(BVHBase) - skipCopy, groupCnt);
+ baseDest->Meta.allocationSize = compactedSize;
+
+ if (baseSrc->Meta.instanceCount)
+ {
+ const uint32_t instLeafsSize = BVHBase_GetNumHWInstanceLeaves(baseSrc) * sizeof(HwInstanceLeaf);
+ CopyMemory(dest + offset, (global char*)BVHBase_GetHWInstanceLeaves(baseSrc), instLeafsSize, groupCnt);
+ const uint instanceLeafStart = (uint)(offset / 64);
+ baseDest->instanceLeafStart = instanceLeafStart;
+ instChildFix -= instanceLeafStart;
+ offset += instLeafsSize;
+ baseDest->instanceLeafEnd = (uint)(offset / 64);
+ }
+ if (baseSrc->Meta.geoCount)
+ {
+ const uint quadLeafsSize = BVHBase_GetNumQuads(baseSrc) * sizeof(QuadLeaf);
+ if (quadLeafsSize)
+ {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetQuadLeaves(baseSrc), quadLeafsSize, groupCnt);
+ const uint quadLeafStart = (uint)(offset / 64);
+ baseDest->quadLeafStart = quadLeafStart;
+ quadChildFix -= quadLeafStart;
+ offset += quadLeafsSize;
+ baseDest->quadLeafCur = (uint)(offset / 64);
+ }
+
+ const uint procLeafsSize = BVHBase_GetNumProcedurals(baseSrc) * sizeof(ProceduralLeaf);
+ if (procLeafsSize)
+ {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetProceduralLeaves(baseSrc), procLeafsSize, groupCnt);
+ const uint proceduralDataStart = (uint)(offset / 64);
+ baseDest->proceduralDataStart = proceduralDataStart;
+ procChildFix -= proceduralDataStart;
+ offset += procLeafsSize;
+ baseDest->proceduralDataCur = (uint)(offset / 64);
+ }
+ }
+ // copy the nodes, fixing up their child offsets as we go
+ global uint* nodeDest = (global uint*)(dest + sizeof(BVHBase));
+ global InternalNode* nodeSrc = (global InternalNode*)BVHBase_GetInternalNodes(baseSrc);
+ // used in mixed case
+ char* instanceLeavesBegin = (char*)BVHBase_GetHWInstanceLeaves(baseSrc);
+ char* instanceLeavesEnd = (char*)BVHBase_GetHWInstanceLeaves_End(baseSrc);
+ uint localId = get_sub_group_local_id();
+ for (uint i = get_group_id(0); i < numNodes; i += groupCnt)
+ {
+ uint nodePart = CacheLineSubgroupRead((const global char*)&nodeSrc[i]);
+ char nodeType = as_char4(sub_group_broadcast(nodePart, offsetof(InternalNode, nodeType) / 4))[0];
+ if (localId * 4 == offsetof(InternalNode, childOffset))
+ {
+ int childOffset = as_int(nodePart);
+ if (nodeType == NODE_TYPE_MIXED)
+ {
+ char* childPtr = (char*)&nodeSrc[i] + 64 * childOffset;
+ if (childPtr > instanceLeavesBegin && childPtr < instanceLeavesEnd)
+ nodePart = as_int(childOffset - instChildFix);
+ }
+ else if (nodeType == NODE_TYPE_INSTANCE)
+ nodePart = as_int(childOffset - instChildFix);
+ else if (nodeType == NODE_TYPE_QUAD)
+ nodePart = as_int(childOffset - quadChildFix);
+ else if (nodeType == NODE_TYPE_PROCEDURAL)
+ nodePart = as_int(childOffset - procChildFix);
+ }
+ nodeDest[i * 16 + localId] = nodePart;
+ }
+
+ if (baseSrc->Meta.instanceCount)
+ {
+ const uint32_t instanceDescSize = baseSrc->Meta.instanceCount * sizeof(InstanceDesc);
+ CopyMemory(dest + offset, src + baseSrc->Meta.instanceDescsStart, instanceDescSize, groupCnt);
+ baseDest->Meta.instanceDescsStart = offset;
+ offset += instanceDescSize;
+ }
+ if (baseSrc->Meta.geoCount)
+ {
+ const uint32_t geoMetaSize = baseSrc->Meta.geoCount * sizeof(GeoMetaData);
+ CopyMemory(dest + offset, src + baseSrc->Meta.geoDescsStart, geoMetaSize, groupCnt);
+ baseDest->Meta.geoDescsStart = offset;
+ offset += (geoMetaSize + 63) & ~63; // align to 64
+ }
+
+ uint backPointerDataStart = offset / 64;
+ uint refitTreeletsDataStart = backPointerDataStart;
+ uint refitStartPointDataStart = backPointerDataStart;
+ uint dataEnd = backPointerDataStart;
+ uint fatLeafTableStart = dataEnd;
+ uint fatLeafCount = baseSrc->fatLeafCount;
+ uint innerTableStart = dataEnd;
+ uint innerCount = baseSrc->innerCount;
+
+ uint quadLeftoversCountNewAtomicUpdate = baseSrc->quadLeftoversCountNewAtomicUpdate;
+ uint quadTableSizeNewAtomicUpdate = baseSrc->quadTableSizeNewAtomicUpdate;
+ uint quadIndicesDataStart = dataEnd;
+
+ if (BVHBase_HasBackPointers(baseSrc))
+ {
+#if 0 //
+ const uint oldbackpontersDataStart = baseSrc->backPointerDataStart;
+ const uint shift = oldbackpontersDataStart - backPointerDataStart;
+ const uint refitStructsSize = ((BVHBase_GetRefitStructsDataSize(baseSrc)) + 63) & ~63;
+
+ CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), refitStructsSize, groupCnt);
+
+ refitTreeletsDataStart = baseSrc->refitTreeletsDataStart - shift;
+ refitStartPointDataStart = baseSrc->refitStartPointDataStart - shift;
+ dataEnd = baseSrc->BVHDataEnd - shift;
+#else // compacting version
+ const uint backpointersSize = ((numNodes*sizeof(uint)) + 63) & ~63;
+ CopyMemory(dest + offset, (global char*)BVHBase_GetBackPointers(baseSrc), backpointersSize, groupCnt);
+ offset += backpointersSize;
+
+ refitTreeletsDataStart = offset / 64;
+ refitStartPointDataStart = offset / 64;
+
+ // TODO: remove treelets from .... everywhere
+ const uint treeletExecutedCnt = *BVHBase_GetRefitTreeletCntPtr(baseSrc);
+
+ if (treeletExecutedCnt)
+ {
+ const uint treeletCnt = treeletExecutedCnt > 1 ? treeletExecutedCnt + 1 : 1;
+
+ refitTreeletsDataStart = offset / 64;
+ const uint treeletsSize = ((treeletCnt * sizeof(RefitTreelet)) + 63) & ~63;
+ RefitTreelet* destTreelets = (RefitTreelet*)(dest + offset);
+ RefitTreelet* srcTreelets = BVHBase_GetRefitTreeletDescs(baseSrc);
+
+ uint numThreads = groupCnt * get_local_size(0);
+ uint globalID = (get_group_id(0) * get_local_size(0)) + get_local_id(0);
+
+ for (uint i = globalID; i < treeletCnt; i += numThreads)
+ {
+ RefitTreelet dsc = srcTreelets[i];
+ RefitTreeletTrivial* trivial_dsc = (RefitTreeletTrivial*)&dsc;
+ if (trivial_dsc->numStartpoints == 1 && trivial_dsc->childrenOffsetOfTheNode > numNodes) {
+ trivial_dsc->childrenOffsetOfTheNode -= quadChildFix;
+ }
+ destTreelets[i] = dsc;
+ }
+
+ offset += treeletsSize;
+
+ refitStartPointDataStart = offset / 64;
+ const uint startPointsSize = (BVHBase_GetRefitStartPointsSize(baseSrc) + 63) & ~63;
+ CopyMemory(dest + offset, (global char*)BVHBase_GetRefitStartPoints(baseSrc), startPointsSize, groupCnt);
+ offset += startPointsSize;
+ dataEnd = offset / 64;
+ }
+
+ uint fatleafEntriesSize = ((fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63);
+ fatLeafTableStart = offset / 64;
+ if (fatleafEntriesSize) {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), fatleafEntriesSize, groupCnt);
+ }
+ offset += fatleafEntriesSize;
+
+ // New atomic update
+ if(baseSrc->quadIndicesDataStart > baseSrc->backPointerDataStart)
+ {
+ uint numQuads = BVHBase_GetNumQuads(baseSrc);
+ uint quadTableMainBufferSize = (numQuads + 255) & ~255;
+ uint quadLeftoversSize = (quadLeftoversCountNewAtomicUpdate + 255) & ~255;
+ uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);
+ if (quadTableEntriesSize) {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetFatLeafTable(baseSrc), quadTableEntriesSize, groupCnt);
+ }
+ offset += quadTableEntriesSize;
+
+ uint quadIndicesDataSize = ((numQuads * sizeof(QuadDataIndices) + 63) & ~63);
+ quadIndicesDataStart = offset / 64;
+ if (quadIndicesDataSize) {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetQuadDataIndicesTable(baseSrc), quadIndicesDataSize, groupCnt);
+ }
+ offset += quadIndicesDataSize;
+ }
+
+ uint innerEntriesSize = ((innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63);
+ innerTableStart = offset / 64;
+ if (innerEntriesSize) {
+ CopyMemory(dest + offset, (global char*)BVHBase_GetInnerNodeTable(baseSrc), innerEntriesSize, groupCnt);
+ }
+ offset += innerEntriesSize;
+
+ dataEnd = offset / 64;
+#endif
+ }
+
+ baseDest->backPointerDataStart = backPointerDataStart;
+ baseDest->refitTreeletsDataStart = refitTreeletsDataStart;
+ baseDest->refitStartPointDataStart = refitStartPointDataStart;
+ baseDest->fatLeafTableStart = fatLeafTableStart;
+ baseDest->fatLeafCount = fatLeafCount;
+ baseDest->innerTableStart = innerTableStart;
+ baseDest->innerCount = innerCount;
+
+ baseDest->quadLeftoversCountNewAtomicUpdate = quadLeftoversCountNewAtomicUpdate;
+ baseDest->quadTableSizeNewAtomicUpdate = quadTableSizeNewAtomicUpdate;
+ baseDest->quadIndicesDataStart = quadIndicesDataStart;
+ baseDest->BVHDataEnd = dataEnd;
+}
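
compactT moves each leaf section to a new 64-byte block offset and then rewrites the child offsets of the copied internal nodes. Because child offsets are expressed in 64-byte blocks relative to the node, and the inner nodes keep their position right after the header, the fix-up is just the difference between the old and new section starts. A scalar sketch of that adjustment:

```c
#include <stdint.h>

/* Child offsets are in 64B blocks, relative to the node itself; assuming the
 * node's own block index is unchanged, only the section delta is applied. */
static inline int32_t fix_child_offset(int32_t old_child_offset,
                                       uint32_t old_section_start_64B,
                                       uint32_t new_section_start_64B)
{
    int32_t section_delta = (int32_t)old_section_start_64B -
                            (int32_t)new_section_start_64B;
    return old_child_offset - section_delta;
}
```
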
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel compact(global char* dest,
+ global char* src,
+ uint groupCnt)
+{
+ uint64_t compactedSize = compute_compacted_size((BVHBase*)src);
+ compactT(dest, src, compactedSize, 0, groupCnt);
+}
+
+// build the serialization header across all lanes: each lane gets one dword of the header, and the last two lanes carry the 64-bit remainder data
+GRL_INLINE
+unsigned prepare_header(
+ uint64_t headerSize,
+ uint64_t instancePtrSize,
+ uint64_t numInstances,
+ uint64_t bvhSize,
+ uint8_t* driverID,
+ uint64_t reminder)
+{
+
+ unsigned loc_id = get_sub_group_local_id();
+
+ uint64_t SerializedSizeInBytesIncludingHeader = headerSize + instancePtrSize * numInstances + bvhSize;
+ uint64_t DeserializedSizeInBytes = bvhSize;
+ uint64_t InstanceHandleCount = numInstances;
+
+ char bvh_magic_str[] = BVH_MAGIC_MACRO;
+ uint* bvh_magic_uint = (uint*)bvh_magic_str;
+
+ unsigned headerTempLanePiece;
+ if (loc_id < 4) { headerTempLanePiece = *((unsigned*)&driverID[4*loc_id]); }
+ else if (loc_id == 4) { headerTempLanePiece = bvh_magic_uint[0]; }
+ else if (loc_id == 5) { headerTempLanePiece = bvh_magic_uint[1]; }
+ else if (loc_id == 6) { headerTempLanePiece = bvh_magic_uint[2]; }
+ else if (loc_id == 7) { headerTempLanePiece = bvh_magic_uint[3]; }
+ else if (loc_id == 8) { headerTempLanePiece = (uint)SerializedSizeInBytesIncludingHeader; }
+ else if (loc_id == 9) { headerTempLanePiece = (uint)(SerializedSizeInBytesIncludingHeader >> 32ul); }
+ else if (loc_id == 10) { headerTempLanePiece = (uint)DeserializedSizeInBytes; }
+ else if (loc_id == 11) { headerTempLanePiece = (uint)(DeserializedSizeInBytes >> 32ul); }
+ else if (loc_id == 12) { headerTempLanePiece = (uint)InstanceHandleCount; }
+ else if (loc_id == 13) { headerTempLanePiece = (uint)(InstanceHandleCount >> 32ul); }
+ else if (loc_id == 14) { headerTempLanePiece = (uint)reminder; }
+ else if (loc_id == 15) { headerTempLanePiece = (uint)(reminder >> 32ul); }
+
+ return headerTempLanePiece;
+}
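
prepare_header distributes one dword per lane across a 64-byte line: 16 bytes of driver identifier, a 16-byte magic, three 64-bit sizes/counts, and 8 bytes borrowed from whatever immediately follows the 56-byte header (the TRICK A/B patch). A host-side view of that layout as assumed from the lane assignments above; the first two field names are illustrative, not the definition from the GRL headers:

```c
#include <stdint.h>

/* Assumed layout of the first 64 bytes written by the serialization path:
 * a 56-byte header plus the first 8 bytes of the payload that follows it. */
typedef struct {
    uint8_t  driver_identifier[16];                 /* lanes 0-3  */
    uint8_t  bvh_magic[16];                         /* lanes 4-7  */
    uint64_t SerializedSizeInBytesIncludingHeader;  /* lanes 8-9  */
    uint64_t DeserializedSizeInBytes;               /* lanes 10-11 */
    uint64_t InstanceHandleCount;                   /* lanes 12-13 */
    uint64_t first_payload_qword;                   /* lanes 14-15, not part of the header */
} SerializedHeaderLine;                             /* 64 bytes total */
```
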
+
+
+
+
+GRL_INLINE
+void serializeT(
+ global byte_align64B* dest,
+ global byte_align64B* src,
+ global uint8_t* driverID,
+ uint groups_count)
+{
+ SerializationHeader* header = (SerializationHeader*)dest;
+ BVHBase* base = (BVHBase*)src;
+
+ const uint headerSize = sizeof(SerializationHeader);
+ const uint numInstances = base->Meta.instanceCount;
+ const uint instancePtrSize = sizeof(gpuva_t);
+ const uint compactedSize = compute_compacted_size(base);
+ uint local_id = get_sub_group_local_id();
+
+ // this is not 64byte aligned :(
+ const uint offsetToBvh = headerSize + instancePtrSize * numInstances;
+
+ global InstanceDesc* src_instances = 0;
+
+ if (numInstances) {
+ src_instances = (global InstanceDesc*)((uint64_t)base + base->Meta.instanceDescsStart);
+ }
+
+ // effectively this part should end up as one 64B aligned 64B write
+ if (get_group_id(0) == groups_count - 1)
+ {
+ Block64B headerPlus;
+
+ // we patch the missing piece with the first instance pointer or the bvh beginning (TRICK A and B);
+ // we assume the header is 56B.
+ global uint64_t* srcPiece = (numInstances != 0) ? &src_instances[0].AccelerationStructureGPUVA : (global uint64_t*)src;
+
+ unsigned headerTemp;
+
+ headerTemp = prepare_header(
+ headerSize,
+ instancePtrSize,
+ numInstances,
+ compactedSize,
+ driverID,
+ *srcPiece);
+
+ CacheLineSubgroupWrite((global byte_align64B*)dest, headerTemp);
+ }
+
+ if (numInstances > 0)
+ {
+ uint instancesOffset = headerSize;
+ uint aligned_instance_ptrs_offset = ((instancesOffset + 63) >> 6) << 6;
+ uint unaligned_prefixing_instance_cnt = (aligned_instance_ptrs_offset - instancesOffset) >> 3;
+ unaligned_prefixing_instance_cnt = min(unaligned_prefixing_instance_cnt, numInstances);
+
+ global uint64_t* dst_instances = (global uint64_t*)(dest + instancesOffset);
+
+ // we've already copied the first instance pointer(s) together with the header line (see TRICK A),
+ // so the remaining instance pointers start at aligned memory
+ uint numAlignedInstances = numInstances - unaligned_prefixing_instance_cnt;
+ dst_instances += unaligned_prefixing_instance_cnt;
+ src_instances += unaligned_prefixing_instance_cnt;
+
+ if (numAlignedInstances)
+ {
+ // each 8 instances form a cacheline
+ uint numCachelines = numAlignedInstances >> 3; //qwords -> 64Bs
+ // qwords beyond the last full group of 8;
+ uint startReminder = numAlignedInstances & ~((1 << 3) - 1);
+ uint numreminder = numAlignedInstances & ((1 << 3) - 1);
+
+ uint task_id = get_group_id(0);
+
+ while (task_id < numCachelines)
+ {
+ uint src_id = task_id * 8 + (local_id >> 1);
+ uint* src_uncorected = (uint*)& src_instances[src_id].AccelerationStructureGPUVA;
+ uint* src = ((local_id & 1) != 0) ? src_uncorected + 1 : src_uncorected;
+ uint data = *src;
+
+ global char* dst = (global byte_align64B*)(dst_instances + (8 * task_id));
+ CacheLineSubgroupWrite(dst, data);
+ task_id += groups_count;
+ }
+
+ if (task_id == numCachelines && local_id < 8 && numreminder > 0)
+ {
+ // this should write full cacheline
+
+ uint index = startReminder + local_id;
+ // data will be taken from instances for lanes (local_id < numreminder)
+ // copy srcbvh beginning as uint64_t for remaining lanes (TRICK B)
+ global uint64_t* srcData = (local_id < numreminder) ?
+ &src_instances[index].AccelerationStructureGPUVA :
+ ((global uint64_t*)src) + (local_id - numreminder);
+ dst_instances[index] = *srcData;
+ }
+ }
+ }
+
+ // the code above already copied the unaligned beginning of the destination bvh (see TRICK B)
+ uint32_t unalignedPartCopiedElsewhere = (64u - (offsetToBvh & (64u - 1u)))&(64u - 1u);
+
+ compactT(dest + offsetToBvh, src, compactedSize, unalignedPartCopiedElsewhere, groups_count);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel serialize_indirect(
+ global char* dest,
+ global char* src,
+ global uint8_t* driverID)
+{
+ BVHBase* base = (BVHBase*)src;
+ uint groups_count = GroupCountForCopy(base);
+ serializeT(dest, src, driverID, groups_count);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel serialize_for_input_dump_indirect(
+ global struct OutputBatchPtrs* batchPtrs,
+ global dword* dstOffset,
+ global char* src,
+ global uint8_t* driverID)
+{
+ BVHBase* base = (BVHBase*)src;
+ uint groups_count = GroupCountForCopy(base);
+ global char* dest = (global char*)(batchPtrs->dataStart + *dstOffset);
+ dest += (sizeof(OutputData) + 127) & ~127;
+ serializeT(dest, src, driverID, groups_count);
+}
+
+GRL_INLINE
+void deserializeT(
+ global char* dest,
+ global char* src,
+ unsigned groupCnt)
+{
+ SerializationHeader* header = (SerializationHeader*)src;
+
+ const uint64_t headerSize = sizeof(struct SerializationHeader);
+ const uint64_t instancePtrSize = sizeof(gpuva_t);
+ const uint64_t numInstances = header->InstanceHandleCount;
+ const uint64_t offsetToBvh = headerSize + instancePtrSize * numInstances;
+ const uint64_t bvhSize = header->DeserializedSizeInBytes;
+
+ if (numInstances)
+ {
+ const bool instances_mixed_with_inner_nodes = false;
+ if (instances_mixed_with_inner_nodes)
+ {
+ // not implemented !
+ // copy each node with 64byte granularity if node is instance, patch it mid-copy
+ }
+ else
+ {
+ BVHBase* srcBvhBase = (BVHBase*)(src + offsetToBvh);
+
+ // numHWInstances can be bigger (because of rebraiding) or smaller (because of inactive instances) than
+ // numInstances (count of pointers and descriptors).
+ uint offsetToHwInstances = srcBvhBase->instanceLeafStart << 6;
+ uint numHwInstances = (srcBvhBase->instanceLeafEnd - srcBvhBase->instanceLeafStart) >> 1;
+
+ //
+ // instances live in separate memory intervals,
+ // so copy all the other data the simple way
+ //
+ uint nodesEnd = srcBvhBase->Meta.instanceDescsStart;
+ // copy the range before the instance leaves
+ CopyMemory(dest, (global char*)(src + offsetToBvh), offsetToHwInstances, groupCnt);
+
+ uint offsetPostInstances = srcBvhBase->instanceLeafEnd << 6;
+ uint instanceDescStart = srcBvhBase->Meta.instanceDescsStart;
+ uint sizePostInstances = instanceDescStart - offsetPostInstances;
+ // copy the range after the instance leaves and before the instance descs
+ CopyMemory(dest + offsetPostInstances, (global char*)(src + offsetToBvh + offsetPostInstances), sizePostInstances, groupCnt);
+
+ uint instanceDescEnd = instanceDescStart + numInstances * sizeof(InstanceDesc);
+ uint sizePostInstanceDescs = bvhSize - instanceDescEnd;
+ // copy after instance desc
+ CopyMemory(dest + instanceDescEnd, (global char*)(src + offsetToBvh + instanceDescEnd), sizePostInstanceDescs, groupCnt);
+
+ global gpuva_t* newInstancePtrs = (global gpuva_t*)(src + headerSize);
+ global InstanceDesc* dstDesc = (global InstanceDesc*)(dest + instanceDescStart);
+ global InstanceDesc* srcDesc = (global InstanceDesc*)(src + offsetToBvh + instanceDescStart);
+
+ // copy and patch instance descriptors
+ for (uint64_t instanceIndex = get_group_id(0); instanceIndex < numInstances; instanceIndex += groupCnt)
+ {
+ InstanceDesc desc = srcDesc[instanceIndex];
+ uint64_t newInstancePtr = newInstancePtrs[instanceIndex];
+ desc.AccelerationStructureGPUVA = newInstancePtr; // patch it with new ptr;
+
+ dstDesc[instanceIndex] = desc;
+ }
+
+ // copy and patch hw instance leaves
+ global HwInstanceLeaf* dstInstleafs = (global HwInstanceLeaf*)(dest + offsetToHwInstances);
+ global HwInstanceLeaf* srcInstleafs = (global HwInstanceLeaf*)(src + offsetToBvh + offsetToHwInstances);
+
+ for (uint hwLeafIndex = get_group_id(0); hwLeafIndex < numHwInstances; hwLeafIndex += groupCnt)
+ {
+ // pull the instance from srcBVH
+ HwInstanceLeaf tmpInstleaf = srcInstleafs[hwLeafIndex];
+
+ uint swInstanceIndex = HwInstanceLeaf_GetInstanceIndex(&tmpInstleaf);
+ uint64_t childBvhPtr = (uint64_t)newInstancePtrs[swInstanceIndex];
+ uint64_t originalBvhPtr = (uint64_t)HwInstanceLeaf_GetBVH(&tmpInstleaf);
+
+ HwInstanceLeaf_SetBVH(&tmpInstleaf, childBvhPtr);
+ uint64_t startNode = HwInstanceLeaf_GetStartNode(&tmpInstleaf);
+
+ if (startNode != 0) {
+ uint64_t rootNodeOffset = startNode - originalBvhPtr;
+ HwInstanceLeaf_SetStartNode(&tmpInstleaf, childBvhPtr + rootNodeOffset);
+ }
+
+ dstInstleafs[hwLeafIndex] = tmpInstleaf;
+ }
+ }
+ }
+ else
+ {
+ CopyMemory(dest, (global char*)(src + offsetToBvh), bvhSize, groupCnt);
+ }
+}
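
During deserialization each HW instance leaf gets its BLAS pointer replaced with the caller-supplied handle, and its start-node pointer is rebased so that it keeps the same offset inside the relocated BLAS. A scalar sketch of that rebase:

```c
#include <stdint.h>

/* Rebase an instance leaf's start-node pointer onto a relocated BLAS.
 * A start node of 0 is left alone (e.g. an inactive instance). */
static inline uint64_t rebase_start_node(uint64_t old_start_node,
                                         uint64_t old_blas_va,
                                         uint64_t new_blas_va)
{
    if (old_start_node == 0)
        return 0;
    return new_blas_va + (old_start_node - old_blas_va);
}
```
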
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel deserialize_indirect(
+ global char* dest,
+ global char* src)
+{
+ SerializationHeader* header = (SerializationHeader*)src;
+ const uint64_t bvhSize = header->DeserializedSizeInBytes;
+ unsigned groupCnt = GroupCountForCopySize(bvhSize);
+ deserializeT(dest, src, groupCnt);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel dxr_decode(global char* dest,
+ global char* src)
+{
+
+ DecodeHeader* header = (DecodeHeader*)dest;
+ BVHBase* base = (BVHBase*)src;
+
+ uint32_t numGeos = base->Meta.geoCount;
+ uint32_t numInstances = base->Meta.instanceCount;
+
+ if (numInstances > 0)
+ {
+ header->Type = TOP_LEVEL;
+ header->NumDesc = numInstances;
+
+ D3D12_RAYTRACING_INSTANCE_DESC* instanceDesc = (D3D12_RAYTRACING_INSTANCE_DESC*)(dest + sizeof(DecodeHeader));
+ copyInstanceDescs((InstanceDesc*)((uint64_t)base + (uint64_t)base->Meta.instanceDescsStart),
+ instanceDesc,
+ numInstances);
+ }
+ else if (numGeos > 0)
+ {
+ header->Type = BOTTOM_LEVEL;
+ header->NumDesc = numGeos;
+
+ D3D12_RAYTRACING_GEOMETRY_DESC* geomDescs = (D3D12_RAYTRACING_GEOMETRY_DESC*)(dest + sizeof(DecodeHeader));
+ uint64_t data = (uint64_t)geomDescs + sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) * numGeos;
+ createGeoDescs((GeoMetaData*)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart),
+ geomDescs,
+ numGeos,
+ data);
+
+ work_group_barrier(CLK_GLOBAL_MEM_FENCE);
+
+ copyDataFromQuadLeaves(base,
+ geomDescs);
+
+ copyDataFromLProcedurals(base,
+ geomDescs);
+ }
+ else
+ {
+ header->Type = BOTTOM_LEVEL;
+ header->NumDesc = 0;
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.cl b/src/intel/vulkan/grl/gpu/bvh_debug.cl
new file mode 100644
index 00000000000..bce75fec3ff
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_debug.cl
@@ -0,0 +1,208 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// @file bvh_debug.cl
+//
+// @brief routines to do basic integrity checks
+//
+// Notes:
+//
+
+#include "GRLGen12.h"
+#include "intrinsics.h"
+#include "libs/lsc_intrinsics.h"
+#include "GRLGen12IntegrityChecks.h"
+#include "api_interface.h"
+
+#define ERROR_PRINTF 0
+GRL_INLINE bool commit_err(
+ global uint* some_null,
+ global BVHBase* bvh,
+ global ERROR_INFO* err_info_slot,
+ ERROR_INFO err)
+{
+ if (err.type != error_t_no_error) {
+ uint expected = error_t_no_error;
+ atomic_compare_exchange_global(&err_info_slot->type, &expected, err.type);
+ if (expected == error_t_no_error)
+ {
+ err_info_slot->offset_in_BVH = err.offset_in_BVH;
+ err_info_slot->when = err.when;
+ err_info_slot->reserved = 0xAAACCAAA;
+ mem_fence_evict_to_memory();
+#if ERROR_PRINTF
+ printf("bvh = 0x%llX, err.type = %X, err.offset_in_BVH = %d\n", bvh, err.type, err.offset_in_BVH);
+#else
+ // This is meant to trigger a page fault. Note we have to write directly to memory:
+ // if the write stayed in L3 it would not fault until it got evicted to memory.
+ store_uint_L1UC_L3UC(some_null, 0, 0x0EEE0000 + err.type);
+#endif
+ return true;
+ }
+ }
+ return false;
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel check_tree_topology(
+ global uint* some_null,
+ global BVHBase* bvh,
+ global ERROR_INFO* err,
+ uint phase)
+{
+ uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ if (err->type != error_t_no_error) return;
+
+ uint dummy1, dummy2, dummy3;
+ ERROR_INFO reterr = check_tree_topology_helper(bvh, globalID, &dummy1, &dummy2, &dummy3, false);
+ if (reterr.type == error_t_no_error)
+ {
+ reterr = check_backpointers(bvh, globalID);
+ }
+ if (reterr.type == error_t_no_error)
+ {
+ reterr = validate_atomic_update_structs(bvh, globalID);
+ }
+ reterr.when = phase;
+ commit_err(some_null, bvh, err, reterr);
+}
+
+GRL_INLINE bool IsValid48bPtr(qword ptr)
+{
+ qword CANONIZED_BITS = 0xFFFFul << 48ul;
+ qword canonized_part = ptr & CANONIZED_BITS;
+ bool isIt = ptr != 0 && (
+ canonized_part == 0 || canonized_part == CANONIZED_BITS);
+ return isIt;
+}
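
IsValid48bPtr accepts a pointer only if it is non-zero and its upper 16 bits are either all zeros or all ones, i.e. it looks like a canonical 48-bit address. A standalone equivalent of the same check with a couple of example values:

```c
#include <stdbool.h>
#include <stdint.h>

/* Mirrors the check above: non-zero, and bits 63:48 all clear or all set. */
static inline bool is_canonical_48b(uint64_t ptr)
{
    uint64_t upper = ptr & (0xFFFFull << 48);
    return ptr != 0 && (upper == 0 || upper == (0xFFFFull << 48));
}

/* is_canonical_48b(0x00007fffdeadbeefull) -> true
 * is_canonical_48b(0x12345678deadbeefull) -> false (garbage upper bits) */
```
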
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel check_geos_before_quad_update(
+ global BVHBase* bvh, //dest bvh
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global uint* some_null,
+ global ERROR_INFO* err,
+ uint phase,
+ uint numGeos,
+ uint numThreads)
+{
+ uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ if (err->type != error_t_no_error) return;
+
+ // first check sanity of geos
+ ERROR_INFO geo_insanity_error = { error_t_input_geo_insane, 0 };
+
+ for (uint ID = globalID; ID < numGeos; ID += numThreads * get_sub_group_size())
+ {
+ bool IsSane = IsValid48bPtr((qword)geomDesc);
+
+ if (IsSane) {
+ GRL_RAYTRACING_GEOMETRY_DESC geo = geomDesc[ID];
+ IsSane = geo.Type < NUM_GEOMETRY_TYPES;
+ if (IsSane) {
+ if (geo.Type == GEOMETRY_TYPE_TRIANGLES) {
+ if (geo.Desc.Triangles.IndexFormat >= INDEX_FORMAT_END) {
+ IsSane = false;
+ }
+ else
+ {
+ if (geo.Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE && geo.Desc.Triangles.IndexCount > 2)
+ {
+ IsSane = (geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END) &&
+ IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) &&
+ IsValid48bPtr((qword)geo.Desc.Triangles.pIndexBuffer);
+ }
+ else if (geo.Desc.Triangles.VertexCount > 2)
+ {
+ IsSane =
+ geo.Desc.Triangles.VertexFormat < VERTEX_FORMAT_END &&
+ IsValid48bPtr((qword)geo.Desc.Triangles.pVertexBuffer) != 0;
+ }
+ }
+ }
+ }
+ }
+
+ geo_insanity_error.offset_in_BVH = ID;
+ geo_insanity_error.when = phase;
+ if (!IsSane) {
+ commit_err(some_null, bvh, err, geo_insanity_error);
+ }
+ return;
+ }
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel check_geos_vs_quads(
+ global BVHBase* bvh,
+ global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
+ global uint* some_null,
+ global ERROR_INFO* err,
+ uint phase,
+ uint numGeos,
+ uint numThreads)
+{
+ uint numQuads = BVHBase_GetNumQuads(bvh);
+
+ QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh);
+
+ uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ uint qoffset = bvh->quadLeafStart;
+
+ if (err->type != error_t_no_error) return;
+
+ ERROR_INFO theErr = { error_t_no_error, 0 };
+
+ for (uint ID = globalID; ID < numQuads; ID += numThreads * get_sub_group_size())
+ {
+ ERROR_INFO quadErr = { error_t_quad_leaf_broken, qoffset + ID, phase };
+
+ QuadLeaf quad = quads[ID];
+
+ uint geoIdx = PrimLeaf_GetGeoIndex(&quad.leafDesc);
+
+ if (geoIdx >= numGeos) { commit_err(some_null, bvh, err, quadErr); return; }
+
+ uint numPrimsInGeo = geomDesc[geoIdx].Desc.Triangles.IndexFormat != INDEX_FORMAT_NONE ?
+ geomDesc[geoIdx].Desc.Triangles.IndexCount / 3 :
+ geomDesc[geoIdx].Desc.Triangles.VertexCount / 3;
+
+ if(quad.primIndex0 >= numPrimsInGeo) {
+ commit_err(some_null, bvh, err, quadErr);
+ return;
+ }
+
+ if(!QuadLeaf_IsSingleTriangle(&quad) &&
+ (quad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&quad) >= numPrimsInGeo))
+ {
+ commit_err(some_null, bvh, err, quadErr);
+ return;
+ }
+ }
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel check_instances_linked_bvhs(
+ global uint* some_null,
+ global BVHBase* bvh,
+ global ERROR_INFO* err,
+ uint phase)
+{
+ if (err->type != error_t_no_error) return;
+
+ uint instanceLeafStart = bvh->instanceLeafStart;
+ uint instanceLeafEnd = bvh->instanceLeafEnd;
+ uint numInstances = (instanceLeafEnd - instanceLeafStart) / 2;
+
+ uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ ERROR_INFO reterr = check_instances_linked_bvhs_helper(bvh, globalID, /*touchBlas*/true);
+ reterr.when = phase;
+ commit_err(some_null, bvh, err, reterr);
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_debug.grl b/src/intel/vulkan/grl/gpu/bvh_debug.grl
new file mode 100644
index 00000000000..28008ab09ce
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_debug.grl
@@ -0,0 +1,107 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module bvh_on_gpu_checks;
+
+kernel_module debug_kernels ("bvh_debug.cl")
+{
+ links lsc_intrinsics;
+ kernel opencl_check_tree_topology < kernelFunction="check_tree_topology">;
+ kernel opencl_check_instances_linked_bvhs < kernelFunction="check_instances_linked_bvhs">;
+ kernel opencl_check_geos_before_quad_update < kernelFunction="check_geos_before_quad_update">;
+ kernel opencl_check_geos_vs_quads < kernelFunction="check_geos_vs_quads">;
+}
+
+
+metakernel debug_checks_prepare_const_regs()
+{
+ define cRoundingSIMD REG4;
+ define cInit0 REG5;
+ define cShiftForSIMD REG3;
+ cRoundingSIMD = (16-1);
+ cShiftForSIMD = 4;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+}
+
+metakernel debug_checks_bvh_topology(
+ qword some_null_ptr,
+ qword bvh,
+ qword bvh_inner_nodes_end,
+ qword error_struct,
+ dword when,
+ dword bvh_inner_nodes_start_value )
+{
+ define cRoundingSIMD REG4;
+ define cShiftForSIMD REG3;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + cRoundingSIMD;
+ REG2 = REG2 >> cShiftForSIMD;
+
+ DISPATCHDIM_X = REG2.lo;
+
+ dispatch_indirect opencl_check_tree_topology args(
+ some_null_ptr,
+ bvh,
+ error_struct,
+ when);
+}
+
+metakernel debug_check_instances_linked_bvhs(
+ qword some_null_ptr,
+ qword bvh,
+ qword error_struct,
+ dword numHWThreads,
+ dword when)
+{
+ dispatch opencl_check_instances_linked_bvhs(numHWThreads,1,1) args(
+ some_null_ptr,
+ bvh,
+ error_struct,
+ when);
+}
+
+metakernel debug_check_geos_before_quad_update(
+ qword bvh,
+ qword geos,
+ qword some_null_ptr,
+ qword error_struct,
+ dword when,
+ dword numGeos,
+ dword numHWThreads )
+{
+ dispatch opencl_check_geos_before_quad_update(numHWThreads,1,1) args(
+ bvh,
+ geos,
+ some_null_ptr,
+ error_struct,
+ when,
+ numGeos,
+ numHWThreads );
+}
+
+metakernel debug_check_geos_vs_quads(
+ qword bvh,
+ qword geos,
+ qword some_null_ptr,
+ qword error_struct,
+ dword when,
+ dword numGeos,
+ dword numHWThreads )
+{
+ dispatch opencl_check_geos_vs_quads(numHWThreads,1,1) args(
+ bvh,
+ geos,
+ some_null_ptr,
+ error_struct,
+ when,
+ numGeos,
+ numHWThreads );
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl
new file mode 100644
index 00000000000..4fa222b53eb
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_postbuild_info.cl
@@ -0,0 +1,97 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "d3d12.h"
+#include "common.h"
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel compacted_size(global char *bvh_mem,
+ global char *postbuild_info)
+{
+ BVHBase *base = (BVHBase *)bvh_mem;
+ PostbuildInfoCompactedSize *postbuildInfoCompacted = (PostbuildInfoCompactedSize *)postbuild_info;
+
+ postbuildInfoCompacted->CompactedSizeInBytes = compute_compacted_size(base);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel current_size(global char *bvh_mem,
+ global char *postbuild_info)
+{
+
+ BVHBase *base = (BVHBase *)bvh_mem;
+ PostbuildInfoCurrentSize *postbuildInfoCurrent = (PostbuildInfoCurrentSize *)postbuild_info;
+
+ postbuildInfoCurrent->CurrentSizeInBytes = base->Meta.allocationSize;
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel serialized_size(global char *bvh_mem,
+ global char *postbuild_info)
+{
+
+ BVHBase *base = (BVHBase *)bvh_mem;
+ PostbuildInfoSerializationDesc *postbuildInfoSerialization = (PostbuildInfoSerializationDesc *)postbuild_info;
+
+ uint64_t headerSize = sizeof(SerializationHeader);
+ uint64_t numInstances = base->Meta.instanceCount;
+
+ postbuildInfoSerialization->SerializedSizeInBytes = sizeof(SerializationHeader) +
+ numInstances * sizeof(gpuva_t) +
+ compute_compacted_size(base);
+ //base->Meta.allocationSize;
+ postbuildInfoSerialization->NumBottomLevelAccelerationStructurePointers = numInstances;
+}
+
+void countTrianglesAndProcedurals(GeoMetaData *geoMetaData,
+ uint64_t numGeos,
+ uint64_t *numTriangles,
+ uint64_t *numProcedurals)
+{
+ uint64_t numTrianglesLoc = 0;
+ uint64_t numProceduralsLoc = 0;
+
+ for (uint64_t geoIndex = get_local_id(0); geoIndex < numGeos; geoIndex += get_local_size(0))
+ {
+ if (geoMetaData[geoIndex].Type == GEOMETRY_TYPE_TRIANGLES)
+ {
+ *numTriangles += geoMetaData[geoIndex].PrimitiveCount;
+ }
+ else
+ {
+ *numProcedurals += geoMetaData[geoIndex].PrimitiveCount;
+ }
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel decoded_size(global char *bvh_mem,
+ global char *postbuild_info)
+{
+ BVHBase *base = (BVHBase *)bvh_mem;
+ PostbuildInfoToolsVisualizationDesc *postbuildInfoDecoded = (PostbuildInfoToolsVisualizationDesc *)postbuild_info;
+
+ uint64_t numTriangles = 0;
+ uint64_t numProcedurals = 0;
+ countTrianglesAndProcedurals((GeoMetaData *)((uint64_t)base + (uint64_t)base->Meta.geoDescsStart),
+ base->Meta.geoCount,
+ &numTriangles,
+ &numProcedurals);
+ uint64_t numInstances = base->Meta.instanceCount;
+ uint64_t numDescs = base->Meta.geoCount;
+ uint64_t headerSize = sizeof(DecodeHeader);
+ uint64_t descsSize = numDescs * sizeof(D3D12_RAYTRACING_GEOMETRY_DESC) +
+ numInstances * sizeof(D3D12_RAYTRACING_INSTANCE_DESC);
+
+ // Each triangle is stored separately - 3 vertices (9 floats) per triangle
+ uint64_t triangleDataSize = 9 * sizeof(float);
+ uint64_t proceduralDataSize = sizeof(D3D12_RAYTRACING_AABB);
+ uint64_t geoDataSize = numTriangles * triangleDataSize + numProcedurals * proceduralDataSize;
+
+ postbuildInfoDecoded->DecodedSizeInBytes = headerSize + descsSize + geoDataSize;
+}
diff --git a/src/intel/vulkan/grl/gpu/bvh_rebraid.cl b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl
new file mode 100644
index 00000000000..ab0f891acee
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/bvh_rebraid.cl
@@ -0,0 +1,1683 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "AABB.h"
+#include "GRLGen12.h"
+#include "api_interface.h"
+#include "common.h"
+#include "qbvh6.h"
+
+#define MAX_SPLITS_PER_INSTANCE 64
+#define NUM_REBRAID_BINS 32
+
+#define NUM_CHILDREN 6
+#define MAX_NODE_OFFSET 65535 // can't open nodes whose offsets exceed this
+
+// OCL/DPC++ *SHOULD* have a uniform keyword... but they don't... so I'm making my own
+#define uniform
+#define varying
+
+#define SGPRINT_UNIFORM(fmt,val) {sub_group_barrier(CLK_LOCAL_MEM_FENCE); if( get_sub_group_local_id() == 0 ) { printf(fmt,val); }}
+
+#define SGPRINT_6x(prefix,fmt,type,val) {\
+ type v0 = sub_group_broadcast( val, 0 );\
+ type v1 = sub_group_broadcast( val, 1 );\
+ type v2 = sub_group_broadcast( val, 2 );\
+ type v3 = sub_group_broadcast( val, 3 );\
+ type v4 = sub_group_broadcast( val, 4 );\
+ type v5 = sub_group_broadcast( val, 5 );\
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if( get_sub_group_local_id() == 0 ) { \
+ printf(prefix fmt fmt fmt fmt fmt fmt "\n" , \
+ v0,v1,v2,v3,v4,v5);}}
+
+
+#define SGPRINT_16x(prefix,fmt,type,val) {\
+ type v0 = sub_group_broadcast( val, 0 );\
+ type v1 = sub_group_broadcast( val, 1 );\
+ type v2 = sub_group_broadcast( val, 2 );\
+ type v3 = sub_group_broadcast( val, 3 );\
+ type v4 = sub_group_broadcast( val, 4 );\
+ type v5 = sub_group_broadcast( val, 5 );\
+ type v6 = sub_group_broadcast( val, 6 );\
+ type v7 = sub_group_broadcast( val, 7 );\
+ type v8 = sub_group_broadcast( val, 8 );\
+ type v9 = sub_group_broadcast( val, 9 );\
+ type v10 = sub_group_broadcast( val, 10 );\
+ type v11 = sub_group_broadcast( val, 11 );\
+ type v12 = sub_group_broadcast( val, 12 );\
+ type v13 = sub_group_broadcast( val, 13 );\
+ type v14 = sub_group_broadcast( val, 14 );\
+ type v15 = sub_group_broadcast( val, 15 );\
+ sub_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ if( get_sub_group_local_id() == 0 ) { \
+ printf(prefix fmt fmt fmt fmt fmt fmt fmt fmt \
+ fmt fmt fmt fmt fmt fmt fmt fmt"\n" , \
+ v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15);}}
+
+#if 1
+#define GRL_ATOMIC_INC(addr) atomic_add(addr, 1);
+#else
+#define GRL_ATOMIC_INC(addr) atomic_inc(addr);
+#endif
+
+#if 0
+#define LOOP_TRIPWIRE_INIT uint _loop_trip=0;
+
+#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name) \
+ _loop_trip++;\
+ if ( _loop_trip > max_iterations )\
+ {\
+ printf( "@@@@@@@@@@@@@@@@@@@@ TRIPWIRE!!!!!!!!!!!\n" );\
+ printf( name"\n");\
+ break;\
+ }
+#else
+
+#define LOOP_TRIPWIRE_INIT
+#define LOOP_TRIPWIRE_INCREMENT(max_iterations,name)
+
+#endif
+
+
+
+typedef struct SGHeap
+{
+ uint32_t key_value;
+ bool lane_mask;
+} SGHeap;
+
+GRL_INLINE void SGHeap_init(uniform SGHeap *h)
+{
+ h->lane_mask = false;
+ h->key_value = 0xbaadf00d;
+}
+
+GRL_INLINE bool SGHeap_full(uniform SGHeap *h)
+{
+ return sub_group_all(h->lane_mask);
+}
+GRL_INLINE bool SGHeap_empty(uniform SGHeap *h)
+{
+ return sub_group_all(!h->lane_mask);
+}
+
+GRL_INLINE bool SGHeap_get_lane_mask(uniform SGHeap *h)
+{
+ return h->lane_mask;
+}
+GRL_INLINE uint16_t SGHeap_get_lane_values(uniform SGHeap *h)
+{
+ return (h->key_value & 0xffff);
+}
+
+GRL_INLINE ushort isolate_lowest_bit( ushort m )
+{
+ return m & ~(m - 1);
+}
+
+
+// lane i receives the index of the ith set bit in mask.
+GRL_INLINE ushort subgroup_bit_rank( uniform ushort mask )
+{
+ varying ushort lane = get_sub_group_local_id();
+ ushort idx = 16;
+ for ( uint i = 0; i < NUM_CHILDREN; i++ )
+ {
+ ushort lo = isolate_lowest_bit( mask );
+ mask = mask ^ lo;
+ idx = (lane == i) ? lo : idx;
+ }
+
+ return ctz( idx );
+}
+
+// push a set of elements spread across a subgroup. Return mask of elements that were not pushed
+GRL_INLINE uint16_t SGHeap_vectorized_push(uniform SGHeap *h, varying uint16_t key, varying uint16_t value, uniform ushort push_mask)
+{
+
+#if 0 // an attempt to make this algorithm branchless
+ varying uint key_value = (((uint)key) << 16) | ((uint)value);
+ uniform ushort free_mask = intel_sub_group_ballot( !h->lane_mask );
+
+ varying ushort free_slot_idx = subgroup_bit_prefix_exclusive( free_mask ); // for each heap slot, what is its position in a compacted list of free slots (prefix sum)
+ varying ushort push_idx = subgroup_bit_prefix_exclusive( push_mask ); // for each lane, what is its position in a compacted list of pushing lanes (prefix sum)
+
+ uniform ushort num_pushes = min( popcount( free_mask ), popcount( push_mask ) );
+
+ varying ushort push_index = subgroup_bit_rank( push_mask ); // lane i gets the index of the i'th set bit in push_mask
+
+ varying uint shuffled = intel_sub_group_shuffle( key_value, intel_sub_group_shuffle( push_index, free_slot_idx ) );
+ varying bool pushed = false;
+ if ( !h->lane_mask && free_slot_idx < num_pushes )
+ {
+ h->lane_mask = true;
+ h->key_value = shuffled;
+ pushed = true;
+ }
+
+ return push_mask & intel_sub_group_ballot( push_idx >= num_pushes );
+#else
+
+ varying uint lane = get_sub_group_local_id();
+
+ varying uint key_value = (((uint)key) << 16) | ((uint)value);
+ uniform ushort free_mask = intel_sub_group_ballot(!h->lane_mask);
+
+ // TODO_OPT: Look for some clever way to remove this loop
+ while (free_mask && push_mask)
+ {
+ // insert first active child into first available lane
+ uniform uint child_id = ctz(push_mask);
+ uniform uint victim_lane = ctz(free_mask);
+ uniform uint kv = sub_group_broadcast( key_value, child_id );
+ if (victim_lane == lane)
+ {
+ h->lane_mask = true;
+ h->key_value = kv;
+ }
+ push_mask ^= (1 << child_id);
+ free_mask ^= (1 << victim_lane);
+ }
+
+ return push_mask;
+
+#endif
+}
+
+// push an item onto a heap that is full except for one slot
+GRL_INLINE void SGHeap_push_and_fill(uniform SGHeap *h, uniform uint16_t key, uniform uint16_t value)
+{
+ uniform uint32_t key_value = (((uint)key) << 16) | value;
+ if (!h->lane_mask)
+ {
+ h->lane_mask = true;
+ h->key_value = key_value; // only one lane will be active at this point
+ }
+}
+
+// pop the min item from a full heap
+GRL_INLINE void SGHeap_full_pop_min(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out)
+{
+ varying uint lane = get_sub_group_local_id();
+ uniform uint kv = sub_group_reduce_min(h->key_value);
+ if (h->key_value == kv)
+ h->lane_mask = false;
+
+ *key_out = (kv >> 16);
+ *value_out = (kv & 0xffff);
+}
+
+// pop the max item from a heap
+GRL_INLINE void SGHeap_pop_max(uniform SGHeap *h, uniform uint16_t *key_out, uniform uint16_t *value_out)
+{
+ uniform uint lane = get_sub_group_local_id();
+ uniform uint kv = sub_group_reduce_max(h->lane_mask ? h->key_value : 0);
+ if (h->key_value == kv)
+ h->lane_mask = false;
+
+ *key_out = (kv >> 16);
+ *value_out = (kv & 0xffff);
+}
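+
+// The heap stores one packed entry per SIMD lane: the 16-bit key (quantized
+// area) lives in the upper half of key_value and the 16-bit payload (node
+// offset) in the lower half, e.g. key 0x1234 with value 0x0040 packs to
+// 0x12340040. Because the key occupies the high bits, the uint min/max
+// reductions above order entries by key first and use the payload only to
+// break ties.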
+
+GRL_INLINE void SGHeap_printf( SGHeap* heap )
+{
+ uint key = heap->key_value >> 16;
+ uint value = heap->key_value & 0xffff;
+
+ if ( get_sub_group_local_id() == 0)
+ printf( "HEAP: \n" );
+ SGPRINT_16x( " mask: ", "%6u ", bool, heap->lane_mask );
+ SGPRINT_16x( " key : ", "0x%04x ", uint, key );
+ SGPRINT_16x( " val : ", "0x%04x ", uint, value );
+
+}
+
+GRL_INLINE float transformed_aabb_halfArea(float3 lower, float3 upper, const float *Transform)
+{
+ // Compute transformed extent per 'transform_aabb'. Various terms cancel
+ float3 Extent = upper - lower;
+ float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]);
+ float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]);
+ float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]);
+
+ return (ex * ey) + (ey * ez) + (ex * ez);
+}
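+
+// With an identity Transform the fabs() terms reduce to Extent.x/y/z, so the
+// result is Ex*Ey + Ey*Ez + Ex*Ez, i.e. half the surface area of the box; for
+// a general affine Transform the absolute row sums give the extent of the
+// world-space bounding box of the transformed box.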
+
+GRL_INLINE uint16_t quantize_area(float relative_area)
+{
+ // clamp relative area at 0.25 (1/4 of root area)
+ // and apply a non-linear distribution because most things in real scenes are small
+ relative_area = pow(min(1.0f, relative_area * 4.0f), 0.125f);
+ return convert_ushort_rtn( relative_area * 65535.0f );
+}
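+
+// Illustrative values for the mapping above: relative_area >= 0.25 clamps to
+// 1.0 and quantizes to 65535; relative_area == 1/1024 gives
+// pow(1/256, 0.125) == 0.5 -> 32767; relative_area == 0 gives 0.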
+
+GRL_INLINE varying uint16_t SUBGROUP_get_child_areas(uniform InternalNode *n,
+ uniform const float *Transform,
+ uniform float relative_area_scale)
+{
+ varying uint16_t area;
+ varying uint16_t lane = get_sub_group_local_id();
+ varying int exp_x = n->exp_x;
+ varying int exp_y = n->exp_y;
+ varying int exp_z = n->exp_z;
+
+ {
+ // decode the AABB positions. Lower in the bottom 6 lanes, upper in the top
+ uniform uint8_t *px = &n->lower_x[0];
+ uniform uint8_t *py = &n->lower_y[0];
+ uniform uint8_t *pz = &n->lower_z[0];
+
+ varying float fx = convert_float(px[lane]);
+ varying float fy = convert_float(py[lane]);
+ varying float fz = convert_float(pz[lane]);
+ fx = n->lower[0] + bitShiftLdexp(fx, exp_x - 8);
+ fy = n->lower[1] + bitShiftLdexp(fy, exp_y - 8);
+ fz = n->lower[2] + bitShiftLdexp(fz, exp_z - 8);
+
+ // transform the AABBs to world space
+ varying float3 lower = (float3)(fx, fy, fz);
+ varying float3 upper = intel_sub_group_shuffle(lower, lane + 6);
+
+ {
+
+ // TODO_OPT: This is only utilizing 6 lanes.
+ // We might be able to do better by vectorizing the calculation differently
+ float a1 = transformed_aabb_halfArea( lower, upper, Transform );
+ float a2 = a1 * relative_area_scale;
+ area = quantize_area( a2 );
+ }
+ }
+
+ return area;
+}
+
+
+
+GRL_INLINE ushort get_child_area(
+ InternalNode* n,
+ ushort child,
+ const float* Transform,
+ float relative_area_scale )
+{
+ uint16_t area;
+ uint16_t lane = get_sub_group_local_id();
+ int exp_x = n->exp_x;
+ int exp_y = n->exp_y;
+ int exp_z = n->exp_z;
+
+ // decode the AABB positions. Lower in the bottom 6 lanes, upper in the top
+ uint8_t* px = &n->lower_x[0];
+ uint8_t* py = &n->lower_y[0];
+ uint8_t* pz = &n->lower_z[0];
+
+ float3 lower, upper;
+ lower.x = convert_float( n->lower_x[child] );
+ lower.y = convert_float( n->lower_y[child] );
+ lower.z = convert_float( n->lower_z[child] );
+ upper.x = convert_float( n->upper_x[child] );
+ upper.y = convert_float( n->upper_y[child] );
+ upper.z = convert_float( n->upper_z[child] );
+
+ lower.x = bitShiftLdexp( lower.x, exp_x - 8 ); // NOTE: the node's 'lower' field cancels out, so don't add it
+    lower.y = bitShiftLdexp( lower.y, exp_y - 8 ); // see transformed_aabb_halfArea
+ lower.z = bitShiftLdexp( lower.z, exp_z - 8 );
+ upper.x = bitShiftLdexp( upper.x, exp_x - 8 );
+ upper.y = bitShiftLdexp( upper.y, exp_y - 8 );
+ upper.z = bitShiftLdexp( upper.z, exp_z - 8 );
+
+ float a1 = transformed_aabb_halfArea( lower, upper, Transform );
+ float a2 = a1 * relative_area_scale;
+ area = quantize_area( a2 );
+
+ return area;
+}
+
+
+GRL_INLINE varying int SUBGROUP_get_child_offsets(uniform InternalNode *n)
+{
+ varying uint lane = get_sub_group_local_id();
+ varying uint child = (lane < NUM_CHILDREN) ? lane : 0;
+
+ varying uint block_incr = InternalNode_GetChildBlockIncr( n, child );
+
+ //varying uint prefix = sub_group_scan_exclusive_add( block_incr );
+ varying uint prefix;
+ if ( NUM_CHILDREN == 6 )
+ {
+ prefix = block_incr + intel_sub_group_shuffle_up( 0u, block_incr, 1u );
+ prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 2 );
+ prefix = prefix + intel_sub_group_shuffle_up( 0u, prefix, 4 );
+ prefix = prefix - block_incr;
+ }
+
+ return n->childOffset + prefix;
+}
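+
+// The three shuffle_up steps above form a log-step inclusive prefix sum
+// (valid for the first 8 lanes), and subtracting block_incr makes it
+// exclusive. E.g. with per-lane block_incr = {1,2,1,1,2,1} for the 6 children
+// the exclusive prefix is {0,1,3,4,5,7}, so child i sits at childOffset plus
+// the blocks consumed by children 0..i-1, relative to the current node.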
+
+
+// compute the maximum number of leaf nodes that will be produced given 'num_splits' node openings
+GRL_INLINE uint get_num_nodes(uint num_splits, uint max_children)
+{
+ // each split consumes one node and replaces it with N nodes
+ // there is initially one node
+ // number of nodes is thus: N*s + 1 - s ==> (N-1)*s + 1
+ return (max_children - 1) * num_splits + 1;
+}
+
+// compute the number of node openings that can be performed given a fixed extra node budget
+GRL_INLINE uint get_num_splits(uint num_nodes, uint max_children)
+{
+ // inverse of get_num_nodes: x = (n-1)s + 1
+ // s = (x-1)/(n-1)
+ if (num_nodes == 0)
+ return 0;
+
+ return (num_nodes - 1) / (max_children - 1);
+}
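+
+// Example with max_children == NUM_CHILDREN == 6: each split nets 5 extra
+// nodes, so get_num_nodes(3, 6) == 16 and the inverse get_num_splits(16, 6)
+// == (16 - 1) / 5 == 3; integer division drops any partial split, e.g.
+// get_num_splits(14, 6) == 2.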
+
+GRL_INLINE uint get_rebraid_bin_index(uint16_t quantized_area, uint NUM_BINS)
+{
+ // arrange bins in descending order by size
+ float relative_area = quantized_area * (1.0f/65535.0f);
+ relative_area = 1.0f - relative_area; // arrange bins largest to smallest
+ size_t bin = round(relative_area * (NUM_BINS - 1));
+ return bin;
+}
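+
+// With NUM_BINS == NUM_REBRAID_BINS == 32: quantized_area == 65535 (largest)
+// maps to bin 0, quantized_area == 0 maps to bin 31, and a mid-range value
+// such as 32768 lands in bin 15, so lower-numbered bins hold the larger
+// instances, which are considered first when the split budget is handed out.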
+
+GRL_INLINE global InternalNode *get_node(global BVHBase *base, int incr)
+{
+ global char *ptr = (((global char *)base) + BVH_ROOT_NODE_OFFSET); // NOTE: Assuming this will be hoisted out of inner loops
+
+ return (global InternalNode *)(ptr + incr * 64);
+}
+
+GRL_INLINE bool is_aabb_valid(float3 lower, float3 upper)
+{
+ return all(isfinite(lower)) &&
+ all(isfinite(upper)) &&
+ all(lower <= upper);
+}
+
+GRL_INLINE bool is_node_openable(InternalNode *n)
+{
+ // TODO_OPT: Optimize me by fetching dwords instead of looping over bytes
+    // TODO_OPT: Pre-compute openability and pack into the pad byte next to the nodeType field??
+ bool openable = n->nodeType == NODE_TYPE_INTERNAL;
+ if ( openable )
+ {
+ for ( uint i = 0; i < NUM_CHILDREN; i++ )
+ {
+ bool valid = InternalNode_IsChildValid( n, i );
+ uint childType = InternalNode_GetChildType( n, i );
+ openable = openable & (!valid || (childType == NODE_TYPE_INTERNAL));
+ }
+ }
+
+ return openable;
+}
+
+
+GRL_INLINE bool SUBGROUP_can_open_root(
+ uniform global BVHBase *bvh_base,
+ uniform const struct GRL_RAYTRACING_INSTANCE_DESC* instance
+ )
+{
+ if (bvh_base == 0 || GRL_get_InstanceMask(instance) == 0)
+ return false;
+
+ // TODO_OPT: SG-vectorize this AABB test
+ uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
+ uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
+ if (!is_aabb_valid(root_lower, root_upper))
+ return false;
+
+ uniform global InternalNode *node = get_node(bvh_base, 0);
+ if ( node->nodeType != NODE_TYPE_INTERNAL )
+ return false;
+
+ varying bool openable = true;
+ varying uint lane = get_sub_group_local_id();
+ if (lane < NUM_CHILDREN)
+ {
+ varying uint childType = InternalNode_GetChildType(node, lane);
+ varying bool valid = InternalNode_IsChildValid(node, lane);
+ openable = childType == NODE_TYPE_INTERNAL || !valid;
+ }
+
+ return sub_group_all(openable);
+}
+
+
+
+GRL_INLINE
+varying uint2
+SUBGROUP_count_instance_splits(uniform global struct AABB3f *geometry_bounds,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance)
+{
+ uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
+ if (!SUBGROUP_can_open_root(bvh_base, instance))
+ return (uint2)(0, 0);
+
+ uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds);
+ uniform float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
+ uniform float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
+
+ uniform uint16_t quantized_area = quantize_area(transformed_aabb_halfArea(root_lower, root_upper, instance->Transform) * relative_area_scale);
+ uniform uint16_t node_offs = 0;
+
+ uniform SGHeap heap;
+ uniform uint num_splits = 0;
+
+ SGHeap_init(&heap);
+ varying uint sg_split_counts_hi = 0; // cross-subgroup bin counters
+ varying uint sg_split_counts_lo = 0;
+
+ uniform global InternalNode* node_array = get_node( bvh_base, 0 );
+
+ LOOP_TRIPWIRE_INIT;
+
+ while (1)
+ {
+ uniform global InternalNode* node = node_array + node_offs;
+
+ // count this split
+ uniform uint bin = get_rebraid_bin_index(quantized_area, NUM_REBRAID_BINS);
+ varying uint lane = get_sub_group_local_id();
+
+ sg_split_counts_hi += ((lane + 16) == bin) ? 1 : 0;
+ sg_split_counts_lo += (lane == bin) ? 1 : 0;
+
+ // open this node and push all of its openable children to heap
+ varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node);
+ varying bool sg_openable = 0;
+        if (lane < NUM_CHILDREN && sg_offs <= MAX_NODE_OFFSET)
+ if (InternalNode_IsChildValid(node, lane))
+ sg_openable = is_node_openable( node_array + sg_offs);
+
+ uniform uint openable_children = intel_sub_group_ballot(sg_openable);
+
+ if ( openable_children )
+ {
+ varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale );
+
+ if ( !SGHeap_full( &heap ) )
+ {
+ openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children );
+ }
+
+ while ( openable_children )
+ {
+ // pop min element
+ uniform uint16_t min_area;
+ uniform uint16_t min_offs;
+ SGHeap_full_pop_min( &heap, &min_area, &min_offs );
+
+ // eliminate all children smaller than heap minimum
+ openable_children &= intel_sub_group_ballot( sg_area > min_area );
+
+ if ( openable_children )
+ {
+ // if any children survived,
+ // kick out heap minimum and replace with first child.. otherwise we will re-push the minimum
+ uniform uint child_id = ctz( openable_children );
+ openable_children ^= (1 << child_id);
+ min_area = sub_group_broadcast( sg_area, child_id );
+ min_offs = sub_group_broadcast( sg_offs, child_id );
+ }
+
+ // re-insert onto heap
+ SGHeap_push_and_fill( &heap, min_area, min_offs );
+
+                // repeat until all children are accounted for. Multiple children may still
+                // fit in the heap, because the heap minimum has changed and must be recomputed
+ }
+ }
+
+ num_splits++;
+ if (num_splits == MAX_SPLITS_PER_INSTANCE)
+ break;
+
+ if (SGHeap_empty(&heap))
+ break;
+
+ // get next node from heap
+ SGHeap_pop_max(&heap, &quantized_area, &node_offs);
+
+ LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_count_splits" );
+
+ }
+
+ return (uint2)(sg_split_counts_lo, sg_split_counts_hi);
+}
+
+typedef struct RebraidBuffers
+{
+ global uint *bin_split_counts; // [num_bins]
+ global uint *bin_instance_counts; // [num_bins]
+    global uint *instance_bin_counts;  // num_instances * num_bins
+} RebraidBuffers;
+
+GRL_INLINE RebraidBuffers cast_rebraid_buffers(global uint *scratch, uint instanceID)
+{
+ RebraidBuffers b;
+ b.bin_split_counts = scratch;
+ b.bin_instance_counts = scratch + NUM_REBRAID_BINS;
+ b.instance_bin_counts = scratch + (2 + instanceID) * NUM_REBRAID_BINS;
+ return b;
+}
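+
+// Resulting layout of 'scratch', in uints (NUM_REBRAID_BINS == 32):
+//   [ 0 .. 31]                   bin_split_counts    (summed over instances)
+//   [32 .. 63]                   bin_instance_counts (summed over instances)
+//   [64 + 32*instanceID .. +31]  this instance's per-bin split counts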
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Compute AABB
+// Dispatch one work item per instance
+///////////////////////////////////////////////////////////////////////////////////////////
+
+GRL_INLINE void rebraid_compute_AABB(
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance)
+{
+ // don't open null rtas
+ global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
+
+ struct AABB new_primref;
+ if (bvh_base != 0)
+ {
+ float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
+ float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
+ const float *Transform = instance->Transform;
+
+ if (is_aabb_valid(root_lower, root_upper))
+ {
+ new_primref = AABBfromAABB3f(transform_aabb(root_lower, root_upper, Transform));
+ }
+ else
+ {
+ // degenerate instance which might be updated to be non-degenerate
+ // use AABB position to guide BVH construction
+ //
+ new_primref.lower.x = Transform[3];
+ new_primref.lower.y = Transform[7];
+ new_primref.lower.z = Transform[11];
+ new_primref.upper = new_primref.lower;
+ }
+ }
+ else
+ {
+ AABB_init(&new_primref);
+ }
+
+ struct AABB subgroup_bbox = AABB_sub_group_reduce(&new_primref);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ AABB3f_atomic_merge_global_lu(&bvh->Meta.bounds, subgroup_bbox.lower.xyz, subgroup_bbox.upper.xyz );
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_computeAABB_DXR_instances(
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances)
+{
+ const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ rebraid_compute_AABB(bvh, instances + instanceID);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_computeAABB_DXR_instances_indirect(
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
+ global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ rebraid_compute_AABB(bvh, instances + instanceID);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_computeAABB_DXR_instances_pointers(
+ global struct BVHBase* bvh,
+ global void *instances_in)
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ rebraid_compute_AABB(bvh, instances[instanceID]);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_computeAABB_DXR_instances_pointers_indirect(
+ global struct BVHBase* bvh,
+ global void *instances_in,
+ global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset;
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instances_in;
+
+ const uint instanceID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ rebraid_compute_AABB(bvh, instances[instanceID]);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Init scratch: Dispatch one work group
+///////////////////////////////////////////////////////////////////////////////////////////
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(64, 1, 1))) void kernel rebraid_init_scratch(global uint *scratch)
+{
+ scratch[get_local_id(0) + get_group_id(0)*get_local_size(0)] = 0;
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel rebraid_chase_instance_pointers(global struct GRL_RAYTRACING_INSTANCE_DESC *instances_out,
+ global void *instance_buff)
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC **instances_in =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC **)instance_buff;
+
+ instances_out[get_local_id(0)] = *instances_in[get_local_id(0)];
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel rebraid_chase_instance_pointers_indirect(
+ global struct GRL_RAYTRACING_INSTANCE_DESC* instances_out,
+ global void* instance_buff,
+ global struct IndirectBuildRangeInfo const* const indirect_data)
+{
+ instance_buff = ((global char*)instance_buff) + indirect_data->primitiveOffset;
+ global const struct GRL_RAYTRACING_INSTANCE_DESC**
+ instances_in = (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instance_buff;
+
+ instances_out[get_local_id(0)] = *instances_in[get_local_id(0)];
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Count splits
+///////////////////////////////////////////////////////////////////////////////////////////
+
+GRL_INLINE void DEBUG_SUBGROUP_print_split_counts( uniform uint instanceID, varying uint split_counts_lo, varying uint split_counts_hi )
+{
+ uniform uint vals[32] = {
+ sub_group_broadcast( split_counts_lo, 0 ), sub_group_broadcast( split_counts_lo, 1 ),
+ sub_group_broadcast( split_counts_lo, 2 ), sub_group_broadcast( split_counts_lo, 3 ),
+ sub_group_broadcast( split_counts_lo, 4 ), sub_group_broadcast( split_counts_lo, 5 ),
+ sub_group_broadcast( split_counts_lo, 6 ), sub_group_broadcast( split_counts_lo, 7 ),
+ sub_group_broadcast( split_counts_lo, 8 ), sub_group_broadcast( split_counts_lo, 9 ),
+ sub_group_broadcast( split_counts_lo, 10 ), sub_group_broadcast( split_counts_lo, 11 ),
+ sub_group_broadcast( split_counts_lo, 12 ), sub_group_broadcast( split_counts_lo, 13 ),
+ sub_group_broadcast( split_counts_lo, 14 ), sub_group_broadcast( split_counts_lo, 15 ),
+
+ sub_group_broadcast( split_counts_hi, 0 ), sub_group_broadcast( split_counts_hi, 1 ),
+ sub_group_broadcast( split_counts_hi, 2 ), sub_group_broadcast( split_counts_hi, 3 ),
+ sub_group_broadcast( split_counts_hi, 4 ), sub_group_broadcast( split_counts_hi, 5 ),
+ sub_group_broadcast( split_counts_hi, 6 ), sub_group_broadcast( split_counts_hi, 7 ),
+ sub_group_broadcast( split_counts_hi, 8 ), sub_group_broadcast( split_counts_hi, 9 ),
+ sub_group_broadcast( split_counts_hi, 10 ), sub_group_broadcast( split_counts_hi, 11 ),
+ sub_group_broadcast( split_counts_hi, 12 ), sub_group_broadcast( split_counts_hi, 13 ),
+ sub_group_broadcast( split_counts_hi, 14 ), sub_group_broadcast( split_counts_hi, 15 )
+ };
+
+ if ( get_sub_group_local_id() == 0 )
+ {
+ printf(
+ "Instance: %4u "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u "
+ "%2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u %2u \n"
+ ,
+ instanceID,
+ vals[0], vals[1], vals[2], vals[3], vals[4], vals[5], vals[6], vals[7],
+ vals[8], vals[9], vals[10], vals[11], vals[12], vals[13], vals[14], vals[15],
+ vals[16], vals[17], vals[18], vals[19], vals[20], vals[21], vals[22], vals[23],
+ vals[24], vals[25], vals[26], vals[27], vals[28], vals[29], vals[30], vals[31]
+ );
+ }
+}
+
+GRL_INLINE void do_rebraid_count_splits_SG(
+ uniform global struct BVHBase* bvh,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
+ uniform global uint *rebraid_scratch)
+{
+ uniform const uint instanceID = get_sub_group_global_id();
+ uniform RebraidBuffers buffers = cast_rebraid_buffers(rebraid_scratch,instanceID);
+
+ varying uint lane = get_sub_group_local_id();
+ varying uint2 splits = SUBGROUP_count_instance_splits(&bvh->Meta.bounds, instances + instanceID);
+ varying uint split_counts_lo = splits.x;
+ varying uint split_counts_hi = splits.y;
+
+ // write this instance's per-bin counts
+ global uint* counts = buffers.instance_bin_counts;
+ intel_sub_group_block_write2( counts, splits );
+
+ // update the per-bin split and instance counters
+ if (split_counts_lo > 0)
+ {
+ atomic_add(&buffers.bin_split_counts[lane], split_counts_lo);
+ GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane]);
+ }
+ if (split_counts_hi > 0)
+ {
+ atomic_add(&buffers.bin_split_counts[lane + 16], split_counts_hi);
+ GRL_ATOMIC_INC(&buffers.bin_instance_counts[lane + 16]);
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_count_splits_SG(
+ uniform global struct BVHBase* bvh,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
+ uniform global uint *rebraid_scratch)
+{
+ do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+rebraid_count_splits_SG_indirect(
+ uniform global struct BVHBase* bvh,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instances,
+ uniform global uint *rebraid_scratch,
+ global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ do_rebraid_count_splits_SG(bvh, instances, rebraid_scratch);
+}
+
+
+#define HEAP_SIZE 16
+#define COUNT_SPLITS_WG_SIZE 16
+
+struct SLMHeapNode
+{
+ short offs;
+ ushort area;
+};
+
+struct SLMHeap
+{
+ struct SLMHeapNode nodes[HEAP_SIZE];
+ ushort size;
+ ushort min_key;
+};
+
+GRL_INLINE bool SLMHeapNode_Greater( struct SLMHeapNode a, struct SLMHeapNode b )
+{
+ return a.area > b.area;
+}
+
+GRL_INLINE ushort SLMHeapNode_UnpackKey( struct SLMHeapNode a )
+{
+ return a.area;
+}
+
+GRL_INLINE void SLMHeapNode_Unpack( struct SLMHeapNode a, ushort* area_out, short* offs_out )
+{
+ *area_out = a.area;
+ *offs_out = a.offs;
+}
+
+GRL_INLINE struct SLMHeapNode SLMHeapNode_Pack( ushort area, short offs )
+{
+ struct SLMHeapNode n;
+ n.offs = offs;
+ n.area = area;
+ return n;
+}
+
+
+GRL_INLINE void SLMHeap_Init( struct SLMHeap* heap )
+{
+ heap->size = 0;
+ heap->min_key = 0xffff;
+}
+
+GRL_INLINE bool SLMHeap_empty( struct SLMHeap* heap )
+{
+ return heap->size == 0;
+}
+
+GRL_INLINE bool SLMHeap_full( struct SLMHeap* heap )
+{
+ return heap->size == HEAP_SIZE;
+}
+
+
+GRL_INLINE void SLMHeap_push( struct SLMHeap* heap, ushort area, short offs )
+{
+ ushort insert_pos;
+ if ( SLMHeap_full( heap ) )
+ {
+ ushort current_min_key = heap->min_key;
+ if ( area <= current_min_key )
+ return; // don't push stuff that's smaller than the current minimum
+
+ // search for the minimum element
+ // The heap is laid out in level order, so it is sufficient to search only the last half
+ ushort last_leaf = HEAP_SIZE - 1;
+ ushort first_leaf = (last_leaf / 2) + 1;
+
+ // as we search, keep track of what the new min-key will be so we can cull future pushes
+ ushort new_min_key = area;
+ ushort min_pos = 0;
+
+ do
+ {
+ ushort idx = first_leaf++;
+
+ ushort current_key = SLMHeapNode_UnpackKey( heap->nodes[idx] );
+ bool found_min_pos = (min_pos == 0) && (current_key == current_min_key);
+
+ if ( found_min_pos )
+ min_pos = idx;
+ else
+ new_min_key = min( current_key, new_min_key );
+
+ } while ( first_leaf != last_leaf );
+
+ heap->min_key = new_min_key;
+ insert_pos = min_pos;
+ }
+ else
+ {
+ insert_pos = heap->size++;
+ heap->min_key = min( area, heap->min_key );
+ }
+
+ heap->nodes[insert_pos] = SLMHeapNode_Pack( area, offs );
+
+ // heap-up
+ while ( insert_pos )
+ {
+ ushort parent = insert_pos / 2;
+
+ struct SLMHeapNode parent_node = heap->nodes[parent];
+ struct SLMHeapNode current_node = heap->nodes[insert_pos];
+ if ( SLMHeapNode_Greater( parent_node, current_node ) )
+ break;
+
+ heap->nodes[insert_pos] = parent_node;
+ heap->nodes[parent] = current_node;
+ insert_pos = parent;
+ }
+
+}
+
+bool SLMHeap_pop_max( struct SLMHeap* heap, ushort* area_out, short* offs_out )
+{
+ if ( SLMHeap_empty( heap ) )
+ return false;
+
+ SLMHeapNode_Unpack( heap->nodes[0], area_out, offs_out );
+
+ // heap down
+ ushort size = heap->size;
+ ushort idx = 0;
+ do
+ {
+ ushort left = 2 * idx + 1;
+ ushort right = 2 * idx + 2;
+ if ( left >= size )
+ break;
+
+ if ( right >= size )
+ {
+ heap->nodes[idx] = heap->nodes[left];
+ break;
+ }
+
+ struct SLMHeapNode left_node = heap->nodes[left];
+ struct SLMHeapNode right_node = heap->nodes[right];
+ bool go_left = SLMHeapNode_Greater( left_node, right_node );
+ heap->nodes[idx] = go_left ? left_node : right_node;
+ idx = go_left ? left : right;
+
+ } while ( 1 );
+
+ heap->size = size - 1;
+ return true;
+}
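+
+// Small usage sketch (illustrative values): pushing areas 5, 9 and 3 leaves 9
+// at nodes[0], so SLMHeap_pop_max returns 9 first; a push onto a full heap is
+// dropped unless its area exceeds min_key, in which case it overwrites the
+// leaf currently holding the tracked minimum.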
+
+void SLMHeap_Print( struct SLMHeap* heap )
+{
+ printf( " size=%u min=%u {", heap->size, heap->min_key );
+ for ( uint i = 0; i < heap->size; i++ )
+ printf( "%04x:%04x", heap->nodes[i].area, heap->nodes[i].offs );
+}
+
+
+GRL_INLINE bool can_open_root(
+ global struct BVHBase* bvh_base,
+ const struct GRL_RAYTRACING_INSTANCE_DESC* instance
+ )
+{
+ float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds );
+ float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds );
+ if ( !is_aabb_valid( root_lower, root_upper ) || GRL_get_InstanceMask(instance) == 0 )
+ return false;
+
+ global InternalNode* node = get_node( bvh_base, 0 );
+ if ( node->nodeType != NODE_TYPE_INTERNAL )
+ return false;
+
+ return is_node_openable( node );
+}
+
+
+GRL_INLINE void count_instance_splits(
+ global struct AABB3f* geometry_bounds,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance,
+ local ushort* bin_split_counts,
+ local struct SLMHeap* heap
+)
+{
+ global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure;
+
+ SLMHeap_Init( heap );
+
+ float relative_area_scale = 1.0f / AABB3f_halfArea( geometry_bounds );
+ float3 root_lower = AABB3f_load_lower( &bvh_base->Meta.bounds );
+ float3 root_upper = AABB3f_load_upper( &bvh_base->Meta.bounds );
+
+ ushort quantized_area = quantize_area( transformed_aabb_halfArea( root_lower, root_upper, instance->Transform ) * relative_area_scale );
+ short node_offs = 0;
+ ushort num_splits = 0;
+
+ global InternalNode* node_array = get_node( bvh_base, 0 );
+
+ while ( 1 )
+ {
+ global InternalNode* node = node_array + node_offs;
+
+ // count this split
+ uint bin = get_rebraid_bin_index( quantized_area, NUM_REBRAID_BINS );
+ bin_split_counts[bin]++;
+
+ // open this node and push children to heap
+
+        // TODO_OPT: Restructure this control flow to prevent different lanes from skipping different loop iterations and diverging
+        // TODO_OPT: Precompute openability masks in BLAS nodes at build time... one bit for self and one bit per child
+ int offs = node->childOffset;
+ for ( ushort i = 0; i < NUM_CHILDREN; i++ )
+ {
+ if ( InternalNode_IsChildValid( node, i ) )
+ {
+ if ( offs >= SHRT_MIN && offs <= SHRT_MAX )
+ {
+ if ( is_node_openable( node_array + offs ) )
+ {
+ ushort area = get_child_area( node, i, instance->Transform, relative_area_scale );
+ SLMHeap_push( heap, area, (short)offs );
+ }
+ }
+ }
+ offs += InternalNode_GetChildBlockIncr( node, i );
+ }
+
+ num_splits++;
+ if ( num_splits == MAX_SPLITS_PER_INSTANCE )
+ break;
+
+ if ( !SLMHeap_pop_max( heap, &quantized_area, &node_offs ) )
+ break;
+ }
+
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( COUNT_SPLITS_WG_SIZE, 1, 1 )) )
+void kernel
+rebraid_count_splits(
+ global struct BVHBase* bvh_base,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ global uint* rebraid_scratch,
+ uint num_instances
+ )
+{
+ local struct SLMHeap heap[COUNT_SPLITS_WG_SIZE];
+ local ushort split_counts[COUNT_SPLITS_WG_SIZE][NUM_REBRAID_BINS];
+
+    // zero this work-item's per-bin split counters
+ // TODO_OPT: transpose this and subgroup-vectorize it so that
+ // block-writes can be used
+ for ( uint i = 0; i < NUM_REBRAID_BINS; i++ )
+ split_counts[get_local_id( 0 )][i] = 0;
+
+
+ // count splits for this thread's instance
+ uniform uint base_instance = get_group_id( 0 ) * get_local_size( 0 );
+ uint instanceID = base_instance + get_local_id( 0 );
+
+ if ( instanceID < num_instances )
+ {
+ global BVHBase* bvh_base = (global BVHBase*)instances[instanceID].AccelerationStructure;
+ if ( can_open_root( bvh_base, &instances[instanceID] ) )
+ {
+ count_instance_splits( &bvh_base->Meta.bounds,
+ &instances[instanceID],
+ &split_counts[get_local_id( 0 )][0],
+ &heap[get_local_id(0)] );
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID );
+
+
+ // reduce bins
+ for ( uint bin = get_local_id( 0 ); bin < NUM_REBRAID_BINS; bin += get_local_size( 0 ) )
+ {
+ // TODO_OPT: There's probably a better way to arrange this computation
+ uint bin_split_count = 0;
+ uint bin_instance_count = 0;
+ for ( uint i = 0; i < COUNT_SPLITS_WG_SIZE; i++ )
+ {
+ uint s = split_counts[i][bin];
+ bin_split_count += s;
+ bin_instance_count += (s > 0) ? 1 : 0;
+ }
+
+ if ( bin_split_count > 0 )
+ {
+ atomic_add( &buffers.bin_split_counts[bin], bin_split_count );
+ atomic_add( &buffers.bin_instance_counts[bin], bin_instance_count );
+ }
+ }
+
+ // write out bin counts for each instance
+ for ( uniform uint i = get_sub_group_id(); i < COUNT_SPLITS_WG_SIZE; i += get_num_sub_groups() )
+ {
+ uniform uint iid = base_instance + i;
+        if ( iid >= num_instances )
+ break;
+
+ global uint* instance_bin_counts = cast_rebraid_buffers( rebraid_scratch, iid ).instance_bin_counts;
+
+ for ( uniform ushort j = 0; j < NUM_REBRAID_BINS; j += get_sub_group_size() )
+ {
+ uint count = split_counts[i][j + get_sub_group_local_id() ];
+ intel_sub_group_block_write( instance_bin_counts + j, count );
+ }
+ }
+
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Build PrimRefs
+///////////////////////////////////////////////////////////////////////////////////////////
+
+GRL_INLINE uint get_instance_split_count(RebraidBuffers buffers, uint instanceID, uint available_splits)
+{
+ global uint* instance_desired_split_count = buffers.instance_bin_counts;
+ global uint *bin_split_counts = buffers.bin_split_counts;
+ global uint *bin_instance_counts = buffers.bin_instance_counts;
+
+ uint total_splits = 0;
+ uint remaining_available_splits = available_splits;
+ uint max_bin = 0;
+ uint desired_splits_this_bin = 0;
+ uint instance_splits = 0;
+
+ do
+ {
+ // stop when we reach a level where we can't satisfy the demand
+ desired_splits_this_bin = instance_desired_split_count[max_bin];
+ uint total_bin_splits = bin_split_counts[max_bin];
+
+ if (total_bin_splits > remaining_available_splits)
+ break;
+
+ // we have enough budget to give all instances everything they want at this level, so do it
+ remaining_available_splits -= total_bin_splits;
+ instance_splits += desired_splits_this_bin;
+ desired_splits_this_bin = 0;
+ max_bin++;
+
+ } while (max_bin < NUM_REBRAID_BINS);
+
+ if (max_bin < NUM_REBRAID_BINS)
+ {
+ // we have more split demand than we have splits available. The current bin is the last one that gets any splits
+ // distribute the leftovers as evenly as possible to instances that want them
+ if (desired_splits_this_bin > 0)
+ {
+ // this instance wants splits. how many does it want?
+ uint desired_total = instance_splits + desired_splits_this_bin;
+
+ // distribute to all instances as many as possible
+ uint count = bin_instance_counts[max_bin];
+ uint whole = remaining_available_splits / count;
+ remaining_available_splits -= whole * count;
+
+ // distribute remainder to lower numbered instances
+ size_t partial = (instanceID < remaining_available_splits) ? 1 : 0;
+
+ // give the instance its share.
+ instance_splits += whole + partial;
+ instance_splits = min(instance_splits, desired_total); // don't give it more than it needs
+ }
+ }
+
+ return instance_splits;
+}
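+
+// Worked example with illustrative numbers: available_splits == 100, per-bin
+// totals bin_split_counts == {40, 30, 50, ...} and this instance's demand
+// instance_bin_counts == {10, 5, 20, ...}. Bins 0 and 1 fit the budget
+// (remaining 100 -> 60 -> 30, instance_splits == 15); bin 2 does not
+// (50 > 30), so the 30 leftover splits are spread over the
+// bin_instance_counts[2] instances wanting bin-2 splits: with 25 such
+// instances each gets 30/25 == 1, instanceIDs 0..4 get one extra, and the
+// result is capped at this instance's total demand of 35.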
+
+GRL_INLINE void build_unopened_primref(
+ struct AABB3f* centroid_bounds,
+ global __const BVHBase *bvh_base,
+ global volatile uint *primref_counter,
+ global struct AABB *primref_buffer,
+ global __const float *Transform,
+ uint instanceID,
+ float matOverhead,
+ ushort instanceMask)
+{
+ float3 root_lower = AABB3f_load_lower(&bvh_base->Meta.bounds);
+ float3 root_upper = AABB3f_load_upper(&bvh_base->Meta.bounds);
+
+ struct AABB primRef;
+ AABB_init( &primRef );
+
+ uint bvhoffset = (uint)BVH_ROOT_NODE_OFFSET;
+ if (is_aabb_valid(root_lower, root_upper) && instanceMask != 0)
+ {
+ primRef = AABBfromAABB3f(compute_xfm_bbox(Transform, BVHBase_GetRootNode(bvh_base), XFM_BOX_NOT_REFINED_TAKE_CLIPBOX, &bvh_base->Meta.bounds, matOverhead));
+ }
+ else
+ {
+ primRef.lower.x = Transform[3];
+ primRef.lower.y = Transform[7];
+ primRef.lower.z = Transform[11];
+ primRef.upper.xyz = primRef.lower.xyz;
+
+ instanceMask = 0;
+ bvhoffset = NO_NODE_OFFSET;
+ }
+
+ primRef.lower.w = as_float(instanceID | (instanceMask << 24));
+ primRef.upper.w = as_float(bvhoffset);
+
+ float3 centroid = primRef.lower.xyz + primRef.upper.xyz;
+ centroid_bounds->lower[0] = centroid.x;
+ centroid_bounds->upper[0] = centroid.x;
+ centroid_bounds->lower[1] = centroid.y;
+ centroid_bounds->upper[1] = centroid.y;
+ centroid_bounds->lower[2] = centroid.z;
+ centroid_bounds->upper[2] = centroid.z;
+
+ uint place = GRL_ATOMIC_INC(primref_counter);
+ primref_buffer[place] = primRef;
+}
+
+GRL_INLINE void build_opened_primrefs(
+ varying bool lane_mask,
+ varying uint offset,
+ varying InternalNode* node,
+ varying struct AABB3f* centroid_bounds,
+ uniform global BVHBase *bvh_base,
+ uniform volatile global uint *primref_counter,
+ uniform global struct AABB *primref_buffer,
+ uniform uint instanceID,
+ uniform const float *Transform,
+ uniform float matOverhead,
+ varying ushort instanceMask)
+{
+ // TODO_OPT: This function is often called with <= 6 active lanes
+ // If lanes are sparse, consider jumping to a sub-group vectorized variant...
+
+ if (lane_mask)
+ {
+ varying uint place = GRL_ATOMIC_INC(primref_counter);
+
+ struct AABB box = AABBfromAABB3f(compute_xfm_bbox(Transform, node, XFM_BOX_NOT_REFINED_CLIPPED, &bvh_base->Meta.bounds, matOverhead));
+
+ box.lower.w = as_float(instanceID | (instanceMask << 24));
+ box.upper.w = as_float(offset * 64 + (uint)BVH_ROOT_NODE_OFFSET);
+ primref_buffer[place] = box;
+
+ AABB3f_extend_point( centroid_bounds, box.lower.xyz + box.upper.xyz );
+ }
+}
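+
+// PrimRef encoding shared by the two builders above: lower.w packs the
+// instance index in bits 0..23 and the instance mask in bits 24..31 (e.g.
+// instanceID 7 with mask 0xFF gives 0xFF000007), while upper.w holds the byte
+// offset of the entry node inside the BLAS: BVH_ROOT_NODE_OFFSET for an
+// unopened root, 64*offset past it for an opened child, or NO_NODE_OFFSET for
+// a degenerate/null instance.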
+
+
+GRL_INLINE void SUBGROUP_open_nodes(
+ uniform global struct AABB3f *geometry_bounds,
+ uniform uint split_limit,
+ uniform global __const struct GRL_RAYTRACING_INSTANCE_DESC *instance,
+ uniform uint instanceID,
+ uniform volatile global uint *primref_counter,
+ uniform global struct AABB *primref_buffer,
+ varying struct AABB3f* centroid_bounds,
+ float transformOverhead)
+{
+ uniform SGHeap heap;
+ SGHeap_init(&heap);
+
+ uniform float relative_area_scale = 1.0f / AABB3f_halfArea(geometry_bounds);
+ uniform global BVHBase *bvh_base = (global BVHBase *)instance->AccelerationStructure;
+
+ uniform uint16_t node_offs = 0;
+ varying uint lane = get_sub_group_local_id();
+
+ uniform InternalNode* node_array = get_node( bvh_base, 0 );
+
+ LOOP_TRIPWIRE_INIT;
+
+ while ( 1 )
+ {
+ uniform InternalNode *node = node_array + node_offs;
+
+ varying uint sg_offs = node_offs + SUBGROUP_get_child_offsets(node);
+ varying bool sg_valid = false;
+ varying bool sg_openable = false;
+ if (lane < NUM_CHILDREN)
+ {
+ sg_valid = InternalNode_IsChildValid(node, lane);
+ if (sg_valid && (sg_offs <= MAX_NODE_OFFSET))
+ {
+ sg_openable = is_node_openable( node_array + sg_offs);
+ }
+ }
+
+ uniform uint16_t valid_children = intel_sub_group_ballot(sg_valid);
+ uniform uint16_t openable_children = intel_sub_group_ballot(sg_openable);
+ uniform uint16_t unopenable_children = valid_children & (~openable_children);
+
+ if ( openable_children )
+ {
+ varying uint16_t sg_area = SUBGROUP_get_child_areas( node, instance->Transform, relative_area_scale );
+
+ // try to push all openable children to the heap
+ if ( !SGHeap_full( &heap ) )
+ {
+ openable_children = SGHeap_vectorized_push( &heap, sg_area, sg_offs, openable_children );
+ }
+
+ // we have more openable children than will fit in the heap
+ // process these one by one.
+ // TODO: Try re-writing with sub_group_any() and see if compiler does a better job
+ while ( openable_children )
+ {
+ // pop min element
+ uniform uint16_t min_area;
+ uniform uint16_t min_offs;
+ SGHeap_full_pop_min( &heap, &min_area, &min_offs );
+
+ // eliminate all children smaller than heap minimum.
+ // mark eliminated children as unopenable
+ varying uint culled_children = openable_children & intel_sub_group_ballot( sg_area <= min_area );
+ unopenable_children ^= culled_children;
+ openable_children &= ~culled_children;
+
+ if ( openable_children )
+ {
+ // if any children survived the purge
+ // find the first such child and swap its offset for the one from the heap
+ //
+ uniform uint child_id = ctz( openable_children );
+ uniform uint16_t old_min_offs = min_offs;
+ min_area = sub_group_broadcast( sg_area, child_id );
+ min_offs = sub_group_broadcast( sg_offs, child_id );
+
+ if ( lane == child_id )
+ sg_offs = old_min_offs;
+
+ openable_children ^= (1 << child_id);
+ unopenable_children ^= (1 << child_id);
+ }
+
+ SGHeap_push_and_fill( &heap, min_area, min_offs );
+
+ }
+ }
+
+ if (unopenable_children)
+ {
+ varying bool sg_create_primref = ((1 << lane) & unopenable_children);
+ build_opened_primrefs(sg_create_primref, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance));
+ }
+
+ --split_limit;
+ if (split_limit == 0)
+ {
+ // split limit exceeded
+ // create primrefs for all remaining openable nodes in heap
+ varying bool sg_mask = SGHeap_get_lane_mask(&heap);
+ sg_offs = SGHeap_get_lane_values(&heap);
+ build_opened_primrefs(sg_mask, sg_offs, node_array + sg_offs, centroid_bounds, bvh_base, primref_counter, primref_buffer, instanceID, instance->Transform, transformOverhead, GRL_get_InstanceMask(instance));
+
+ break;
+ }
+
+
+ // NOTE: the heap should never be empty. If it is, the instance was given too many splits.
+
+ // get next node from heap
+ uint16_t quantized_area;
+ SGHeap_pop_max(&heap, &quantized_area, &node_offs);
+
+ LOOP_TRIPWIRE_INCREMENT( 500, "rebraid_build_primrefs" );
+
+ }
+}
+
+
+#define OPEN_QUEUE_SIZE 256
+#define OPEN_QUEUE_NUM_SGS 16
+
+typedef struct OpenQueueEntry
+{
+ uint instanceID;
+ ushort num_splits;
+} OpenQueueEntry;
+
+typedef struct OpenQueue
+{
+ uint num_produced;
+ uint num_consumed;
+ OpenQueueEntry Q[OPEN_QUEUE_SIZE];
+} OpenQueue;
+
+uniform uint SUBGROUP_GetNextQueueEntry( local OpenQueue* queue )
+{
+ uint next = 0;
+ if ( get_sub_group_local_id() == 0 )
+ next = GRL_ATOMIC_INC( &queue->num_consumed );
+ return sub_group_broadcast( next, 0 );
+}
+
+
+GRL_INLINE void do_rebraid_build_primrefs(
+ local struct AABB3f* SLM_CentroidBounds,
+ local OpenQueue* SLM_Q,
+ global struct Globals* globals,
+ global struct BVHBase* base,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
+ global uint* rebraid_scratch,
+ global struct AABB* primref_buffer,
+ uint extra_primref_count,
+ uint num_instances)
+{
+ varying uint instanceID = get_sub_group_size() * get_sub_group_global_id() + get_sub_group_local_id();
+
+ uniform volatile global uint* primref_counter = &globals->numPrimitives;
+ uniform RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch, instanceID );
+ uniform uint available_splits = get_num_splits( extra_primref_count, NUM_CHILDREN );
+
+
+
+ varying struct AABB3f centroidBounds;
+ AABB3f_init( &centroidBounds );
+
+ if ( get_local_id( 0 ) == 0 )
+ {
+ SLM_Q->num_produced = 0;
+ SLM_Q->num_consumed = 0;
+ AABB3f_init( SLM_CentroidBounds );
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+    // assign a split budget to each instance; build primrefs in vectorized form for instances that receive no splits
+ varying uint num_splits = 0;
+ if ( instanceID < num_instances )
+ {
+ num_splits = get_instance_split_count( buffers, instanceID, available_splits );
+ if ( num_splits == 0 )
+ {
+ varying global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID;
+ varying global BVHBase* bvh_base = (global BVHBase*)instance->AccelerationStructure;
+ if ( bvh_base != 0 )
+ {
+ build_unopened_primref( &centroidBounds, bvh_base, primref_counter, primref_buffer, instance->Transform, instanceID, 0.0f, GRL_get_InstanceMask(instance));
+ }
+ }
+ else
+ {
+ // defer opened instances
+ uint place = GRL_ATOMIC_INC( &SLM_Q->num_produced );
+ SLM_Q->Q[place].instanceID = instanceID;
+ SLM_Q->Q[place].num_splits = (ushort)num_splits;
+ }
+ }
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ // if there were opened instances, process them, one per subgroup
+ uniform uint num_produced = SLM_Q->num_produced;
+ uniform uint next = SUBGROUP_GetNextQueueEntry( SLM_Q );
+
+ while ( next < num_produced )
+ {
+ uniform uint instanceID = SLM_Q->Q[next].instanceID;
+ uniform uint num_splits = SLM_Q->Q[next].num_splits;
+
+ uniform global const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instance_buffer + instanceID;
+
+ float transformOverhead =
+#if FINE_TRANSFORM_NODE_BOX
+ transformation_bbox_surf_overhead(instance->Transform);
+#else
+ 0.0f;
+#endif
+
+ SUBGROUP_open_nodes(
+ &base->Meta.bounds,
+ num_splits,
+ instance,
+ instanceID,
+ primref_counter,
+ primref_buffer,
+ &centroidBounds,
+ transformOverhead);
+
+ next = SUBGROUP_GetNextQueueEntry( SLM_Q );
+ }
+
+ // reduce the centroid bounds AABB
+ struct AABB3f reduced = AABB3f_sub_group_reduce( &centroidBounds );
+ if ( get_sub_group_local_id() == 0 )
+ AABB3f_atomic_merge_localBB_nocheck( SLM_CentroidBounds, &reduced );
+
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ if( get_local_id(0) == 0 )
+ {
+ atomic_min( (global float*) (&globals->centroidBounds.lower) + 0, SLM_CentroidBounds->lower[0] );
+ atomic_min( (global float*) (&globals->centroidBounds.lower) + 1, SLM_CentroidBounds->lower[1] );
+ atomic_min( (global float*) (&globals->centroidBounds.lower) + 2, SLM_CentroidBounds->lower[2] );
+ atomic_max( (global float*) (&globals->centroidBounds.upper) + 0, SLM_CentroidBounds->upper[0] );
+ atomic_max( (global float*) (&globals->centroidBounds.upper) + 1, SLM_CentroidBounds->upper[1] );
+ atomic_max( (global float*) (&globals->centroidBounds.upper) + 2, SLM_CentroidBounds->upper[2] );
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+void kernel rebraid_build_primrefs(
+ global struct Globals* globals,
+ global struct BVHBase* base,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
+ global uint* rebraid_scratch,
+ global struct AABB* primref_buffer,
+ uint extra_primref_count,
+ uint num_instances)
+{
+ local struct AABB3f SLM_CentroidBounds;
+ local OpenQueue SLM_Q;
+ do_rebraid_build_primrefs(
+ &SLM_CentroidBounds,
+ &SLM_Q,
+ globals,
+ base,
+ instance_buffer,
+ rebraid_scratch,
+ primref_buffer,
+ extra_primref_count,
+ num_instances);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( OPEN_QUEUE_SIZE, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) )
+void kernel rebraid_build_primrefs_indirect(
+ global struct Globals* globals,
+ global struct BVHBase* base,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
+ global uint* rebraid_scratch,
+ global struct AABB* primref_buffer,
+ global struct IndirectBuildRangeInfo const * const indirect_data,
+ uint extra_primref_count )
+{
+ local struct AABB3f SLM_CentroidBounds;
+ local OpenQueue SLM_Q;
+
+ instance_buffer = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instance_buffer) + indirect_data->primitiveOffset);
+
+ do_rebraid_build_primrefs(
+ &SLM_CentroidBounds,
+ &SLM_Q,
+ globals,
+ base,
+ instance_buffer,
+ rebraid_scratch,
+ primref_buffer,
+ extra_primref_count,
+ indirect_data->primitiveCount);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////
+// Misc
+///////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+ISA_TEST(global InternalNode *n, global uint *out, global float *xform, float scale)
+{
+
+ out[get_sub_group_local_id()] = InternalNode_IsChildValid(n, get_sub_group_local_id());
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( (reqd_work_group_size( 1, 1, 1 )) ) void kernel
+DEBUG_PRINT(
+ global struct Globals* globals,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance_buffer,
+ global uint* rebraid_scratch,
+ global struct AABB* primref_buffer,
+ dword num_extra,
+ dword input_instances )
+{
+#if 0
+ // validate primrefs
+ if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
+ {
+ uint refs = globals->numPrimitives;
+ for ( uint i = 0; i < refs; i++ )
+ {
+ if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) ||
+ any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) ||
+ any( isnan(primref_buffer[i].lower.xyz) ) ||
+ any( isnan(primref_buffer[i].upper.xyz) ) )
+ {
+ struct AABB box = primref_buffer[i];
+ printf( "BAD BOX: %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ),
+ box.lower.x, box.lower.y, box.lower.z,
+ box.upper.x, box.upper.y, box.upper.z,
+ as_uint( box.lower.w ) );
+ }
+
+ const uint instIndex = PRIMREF_instanceID(&primref_buffer[i]); // TODO: Refactor me. We should not be using struct AABB for primRefs
+ const uint rootByteOffset = as_uint( primref_buffer[i].upper.w ); // It should be struct PrimRef
+ if ( instIndex >= input_instances )
+ printf( "BAD INSTANCE INDEX: %u", i );
+ else
+ {
+ global struct BVHBase* blas = (global struct BVHBase*)instance_buffer[instIndex].AccelerationStructure;
+ if ( blas )
+ {
+ struct InternalNode* start = BVHBase_GetInternalNodes( blas );
+ struct InternalNode* end = BVHBase_GetInternalNodesEnd( blas );
+
+ InternalNode* entryPoint = (struct InternalNode*)((char*)instance_buffer[instIndex].AccelerationStructure + rootByteOffset);
+ if ( entryPoint < start || entryPoint >= end )
+ printf( "BAD ENTRYPOINT: %u\n", i );
+ if ( (rootByteOffset & 63) != 0 )
+ printf( "MISALIGNED ENTRYPOInt: %u\n", i );
+
+ }
+ }
+ }
+ }
+#endif
+#if 0
+ if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
+ printf( "REBRAIDED: %u\n", globals->numPrimitives );
+
+ // print instance bin information
+ if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
+ {
+ printf( "REBRAIDED: %u\n", globals->numPrimitives );
+ for( uint i=0; i<231; i++ )
+ {
+ RebraidBuffers buffers = cast_rebraid_buffers( rebraid_scratch,i );
+ printf( " ID:%4u ", i );
+ for ( uint j = 0; j < NUM_REBRAID_BINS; j++ )
+ {
+ global uint* count = buffers.instance_bin_counts;
+ printf( " %2u ", count[j] );
+ }
+ printf( "\n" );
+ }
+ }
+#endif
+#if 0
+ if ( (get_local_id(0) + get_group_id(0)*get_local_size(0)) == 0 )
+ {
+ printf( "Instances: %u\n", globals->numPrimitives );
+
+ for ( uint i = 0; i < globals->numPrimitives; i++ )
+ {
+ if ( any( primref_buffer[i].lower.xyz < globals->geometryBounds.lower.xyz ) ||
+ any( primref_buffer[i].upper.xyz > globals->geometryBounds.upper.xyz ) )
+ {
+ struct AABB box = primref_buffer[i];
+ printf( " %u {%f,%f,%f} {%f,%f,%f} %u\n", as_uint( box.lower.w ),
+ box.lower.x, box.lower.y, box.lower.z,
+ box.upper.x, box.upper.y, box.upper.z,
+ as_uint( box.lower.w ) );
+ }
+
+ }
+ }
+#endif
+}
+
diff --git a/src/intel/vulkan/grl/gpu/common.h b/src/intel/vulkan/grl/gpu/common.h
new file mode 100644
index 00000000000..5fa0e117ae4
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/common.h
@@ -0,0 +1,429 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "shared.h"
+#include "intrinsics.h"
+#include "AABB.h"
+#include "AABB3f.h"
+#include "qbvh6.h"
+
+/* ====== BVH_BUILDER config ====== */
+
+__constant const float cfg_intCost = 4.0f;
+__constant const float cfg_travCost = 1.0f;
+__constant const uint cfg_minLeafSize = BVH_LEAF_N_MIN;
+__constant const uint cfg_maxLeafSize = BVH_LEAF_N_MAX;
+__constant const uint cfg_maxDepth = BUILDRECORD_STACK_SIZE;
+
+#define ENABLE_CONVERSION_CHECKS 0
+
+#ifdef ENABLE_BIG_REG_ANNOTATION
+#define GRL_ANNOTATE_BIG_REG_REQ __attribute__((annotate("num-thread-per-eu 4")))
+#else
+#define GRL_ANNOTATE_BIG_REG_REQ
+#endif
+
+#ifdef ENABLE_IGC_DO_NOT_SPILL
+#define GRL_ANNOTATE_IGC_DO_NOT_SPILL __attribute__((annotate("igc-do-not-spill")))
+#else
+#define GRL_ANNOTATE_IGC_DO_NOT_SPILL
+#endif
+
+#define ERROR()
+
+/* =================================================================================================================================================== */
+/* =================================================================================================================================================== */
+/* =================================================================================================================================================== */
+/* =================================================================================================================================================== */
+
+GRL_INLINE unsigned int getNumLeafPrims(unsigned int offset)
+{
+ return (offset & 0x7) - 3;
+}
+
+GRL_INLINE unsigned int getLeafOffset(unsigned int offset)
+{
+ return offset & (~0x7);
+}
+
+GRL_INLINE float4 triangleNormal(const float4 v0, const float4 v1, const float4 v2)
+{
+ const float4 a = v1 - v0;
+ const float4 b = v2 - v0;
+ return cross(a, b);
+}
+
+GRL_INLINE float areaTriangle(const float4 v0, const float4 v1, const float4 v2)
+{
+ const float4 normal = triangleNormal(v0, v1, v2);
+ return length((float3)(normal.x, normal.y, normal.z)) * 0.5f;
+}
+
+GRL_INLINE float det2(const float2 a, const float2 b)
+{
+ return a.x * b.y - a.y * b.x;
+}
+
+GRL_INLINE float areaProjectedTriangle(const float4 v0, const float4 v1, const float4 v2)
+{
+ const float xy = 0.5f * fabs(det2(v1.xy - v0.xy, v2.xy - v0.xy));
+ const float yz = 0.5f * fabs(det2(v1.yz - v0.yz, v2.yz - v0.yz));
+ const float zx = 0.5f * fabs(det2(v1.zx - v0.zx, v2.zx - v0.zx));
+ return xy + yz + zx;
+}
+
+typedef struct Block64B {
+ char data[64];
+} Block64B __attribute__((aligned(64)));
+
+typedef char byte_align64B __attribute__((aligned(64)));
+
+/* ====================================================================== */
+/* ============================== GLOBALS =============================== */
+/* ====================================================================== */
+
+GRL_INLINE bool Globals_OnFinish(global struct Globals *globals)
+{
+ /* last active HW thread ? */
+ if (get_local_id(0) == 0)
+ {
+ const uint sync = atomic_add(&globals->sync, 1);
+ if (sync + 1 == get_num_groups(0))
+ {
+ globals->sync = 0;
+ return true;
+ }
+ }
+ return false;
+}
+
+GRL_INLINE uint BlockAllocator_BytesUsed(struct BlockAllocator *p)
+{
+ return p->cur - p->start;
+};
+
+GRL_INLINE uint BlockAllocator_Alloc(__global struct BlockAllocator *p, const uint size)
+{
+ return atomic_add(&p->cur, size);
+}
+
+GRL_INLINE uint BlockAllocator_Alloc_Single(__global struct BlockAllocator *p, const uint size)
+{
+ uint offset = 0;
+ if (get_sub_group_local_id() == 0)
+ offset = atomic_add(&p->cur, size);
+ return sub_group_broadcast(offset, 0);
+}
+
+// node allocation returns an offset from beginning of BVH to allocated node
+// in multiples of 64B
+GRL_INLINE uint allocate_inner_nodes(global struct BVHBase* base, uint num_nodes )
+{
+ return atomic_add_global( &base->nodeDataCur, num_nodes );
+}
+GRL_INLINE uint allocate_procedural_leaves(global struct BVHBase* base, uint num_nodes)
+{
+ return atomic_add_global(&base->proceduralDataCur, num_nodes);
+}
+
+GRL_INLINE uint allocate_quad_leaves(global struct BVHBase* base, uint num_nodes)
+{
+ return atomic_add_global(&base->quadLeafCur, num_nodes);
+}
+
+#if 0
+GRL_INLINE uint alloc_node_mem(global struct Globals *globals, const uint size)
+{
+ const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
+ return BlockAllocator_Alloc(&globals->node_mem_allocator, aligned_size);
+}
+
+GRL_INLINE uint alloc_single_node_mem(global struct Globals *globals, const uint size)
+{
+ const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
+ return BlockAllocator_Alloc_Single(&globals->node_mem_allocator, aligned_size);
+}
+
+GRL_INLINE uint alloc_quad_leaf_mem(global struct Globals *globals, const uint size)
+{
+ const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
+ return BlockAllocator_Alloc(&globals->quad_mem_allocator, aligned_size);
+}
+
+GRL_INLINE uint alloc_procedural_leaf_mem(global struct Globals *globals, const uint size)
+{
+ const uint aligned_size = ((size + 63) / 64) * 64; /* allocate in 64 bytes blocks */
+ return BlockAllocator_Alloc(&globals->procedural_mem_allocator, aligned_size);
+}
+#endif
+
+GRL_INLINE global struct BuildRecord *getBuildRecords(char *bvh_mem, struct Globals *globals)
+{
+ return (global struct BuildRecord *)(bvh_mem + globals->build_record_start);
+}
+
+/* ======================================================================= */
+/* ============================== TRIANGLE =============================== */
+/* ======================================================================= */
+
+/*GRL_INLINE void printTriangle(struct Triangle *t)
+{
+ printf("vtx[0] %d vtx[1] %d vtx[2] %d primID %d geomID %d \n",t->vtx[0],t->vtx[1],t->vtx[2],t->primID,t->geomID);
+ }*/
+
+/* ==================================================================== */
+/* ============================== SPLIT =============================== */
+/* ==================================================================== */
+
+GRL_INLINE void printSplit(struct Split *split)
+{
+ printf("split sah %f dim %d pos %d \n", split->sah, split->dim, split->pos);
+}
+
+/* ========================================================================== */
+/* ============================== BUILDRECORD =============================== */
+/* ========================================================================== */
+
+GRL_INLINE void initBuildRecord(struct BuildRecord *buildRecord, uint start, uint end)
+{
+ AABB_init(&buildRecord->centroidBounds);
+ buildRecord->start = start;
+ buildRecord->end = end;
+}
+
+GRL_INLINE void extendBuildRecord(struct BuildRecord *buildRecord, struct AABB *primref)
+{
+ AABB_extend_point(&buildRecord->centroidBounds, AABB_centroid2(primref));
+}
+
+GRL_INLINE uint getBuildRecursionDepth(struct BuildRecord *buildRecord)
+{
+ return as_uint(buildRecord->centroidBounds.upper.w);
+}
+
+GRL_INLINE void setBuildRecursionDepth(struct BuildRecord *buildRecord, uint depth)
+{
+ buildRecord->centroidBounds.upper.w = as_float(depth);
+}
+
+GRL_INLINE uint getNumPrimsBuildRecord(struct BuildRecord *buildRecord)
+{
+ return buildRecord->end - buildRecord->start;
+}
+
+/* ========================================================================== */
+/* =================== BinaryMortonCodeHierarchy ============================= */
+/* ========================================================================== */
+
+GRL_INLINE void BinaryMortonCodeHierarchy_init(struct BinaryMortonCodeHierarchy *record, uint start, uint end)
+{
+ record->range.start = start;
+ record->range.end = end;
+ record->leftChild = -1;
+ record->rightChild = -1;
+// record->flag = 0;
+}
+
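+// Node IDs with bit 31 set denote leaves; the remaining bits encode the start
+// of the primitive range. All other IDs index directly into the nodes[] array.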
+GRL_INLINE uint BinaryMortonCodeHierarchy_getNumPrimitives(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID)
+{
+ /* leaf case */
+ if (nodeID & (uint)(1 << 31))
+ return 1;
+
+ /* inner node case*/
+ else
+ return nodes[nodeID].range.end - nodes[nodeID].range.start + 1;
+}
+
+GRL_INLINE struct BinaryMortonCodeHierarchy BinaryMortonCodeHierarchy_getEntry(global struct BinaryMortonCodeHierarchy* nodes, uint nodeID)
+{
+ struct BinaryMortonCodeHierarchy entry;
+
+ if (nodeID & (uint)(1 << 31)) {
+ /* leaf case */
+ uint rangeStart = nodeID ^ (uint)(1 << 31);
+ BinaryMortonCodeHierarchy_init(&entry, rangeStart, rangeStart);
+ }
+ else {
+ /* inner node case*/
+ entry = nodes[nodeID];
+ }
+
+ return entry;
+}
+
+GRL_INLINE uint BinaryMortonCodeHierarchy_getRangeStart(global struct BinaryMortonCodeHierarchy *nodes, uint nodeID)
+{
+ /* leaf case */
+ if (nodeID & (uint)(1 << 31))
+ return nodeID ^ (uint)(1 << 31);
+
+ /* inner node case*/
+ else
+ return nodes[nodeID].range.start;
+}
+
+/* ==================================================================== */
+/* ============================== RANGE =============================== */
+/* ==================================================================== */
+
+GRL_INLINE void printRange(struct Range *range)
+{
+ printf("start %d end %d \n", range->start, range->end);
+}
+
+GRL_INLINE bool equalRange(struct Range *range0, struct Range *range1)
+{
+ if (range0->start == range1->start &&
+ range0->end == range1->end)
+ return true;
+ return false;
+}
+
+GRL_INLINE uint getSizeRange(struct Range *range)
+{
+ return range->end - range->start;
+}
+
+/* ==================================================================== */
+/* ========================= ProceduralLeaf =========================== */
+/* ==================================================================== */
+
+#if 0
+struct ProceduralLeaf
+{
+ uint shaderIndex_geomMask;
+ uint geomIndex_flags;
+ uint N_last;
+ uint primIndex[13];
+};
+#endif
+
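+// The low 29 bits of leafDesc.geomIndex_flags hold the geometry index;
+// judging by the field name, the top bits carry the per-geometry flags.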
+GRL_INLINE uint ProceduralLeaf_geomIndex(global struct ProceduralLeaf *This)
+{
+ return This->leafDesc.geomIndex_flags & 0x1FFFFFFF;
+}
+
+GRL_INLINE uint ProceduralLeaf_primIndex(global struct ProceduralLeaf *This, uint i)
+{
+ //assert(i < N);
+ return This->_primIndex[i];
+}
+
+/* ==================================================================== */
+/* =========================== TrianglePair =========================== */
+/* ==================================================================== */
+
+struct TrianglePair
+{
+ uint4 a; // indices of the 4 verts to store in the quad
+ uint3 lb; // index of the second triangle's verts in 'a'
+};
+
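+// Pairs two triangles into a single quad leaf:
+//  - a.xyz holds tri0's vertex indices; a.w holds the one vertex of tri1 that
+//    is not shared with tri0 (it stays tri0.z if all of tri1's verts are shared).
+//  - lb[i] records where tri1's i-th vertex lives inside 'a' (0..2 = shared
+//    with tri0, 3 = the extra vertex stored in a.w).
+//  - When primID0 == primID1 there is no second triangle and lb is forced to 0.
+// Example (traced from the code below): tri0 = (5,7,9), tri1 = (7,9,11)
+// gives a = (5,7,9,11) and lb = (1,2,3).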
+GRL_INLINE struct TrianglePair TrianglePair_Constructor(uint3 tri0, uint primID0, uint3 tri1, uint primID1)
+{
+ struct TrianglePair q;
+ q.a.x = tri0.x;
+ q.a.y = tri0.y;
+ q.a.z = tri0.z;
+ q.a.w = tri0.z;
+
+ uint3 b;
+ b.x = tri1.x;
+ b.y = tri1.y;
+ b.z = tri1.z;
+
+ q.lb = (uint3)(3);
+
+ q.lb.x = (b.x == q.a.x) ? 0 : q.lb.x;
+ q.lb.y = (b.y == q.a.x) ? 0 : q.lb.y;
+ q.lb.z = (b.z == q.a.x) ? 0 : q.lb.z;
+
+ q.lb.x = (b.x == q.a.y) ? 1 : q.lb.x;
+ q.lb.y = (b.y == q.a.y) ? 1 : q.lb.y;
+ q.lb.z = (b.z == q.a.y) ? 1 : q.lb.z;
+
+ q.lb.x = (b.x == q.a.z) ? 2 : q.lb.x;
+ q.lb.y = (b.y == q.a.z) ? 2 : q.lb.y;
+ q.lb.z = (b.z == q.a.z) ? 2 : q.lb.z;
+
+ q.lb.x = (primID0 != primID1) ? q.lb.x : 0;
+ q.lb.y = (primID0 != primID1) ? q.lb.y : 0;
+ q.lb.z = (primID0 != primID1) ? q.lb.z : 0;
+
+ q.a.w = (q.lb.x == 3) ? b.x : q.a.w;
+ q.a.w = (q.lb.y == 3) ? b.y : q.a.w;
+ q.a.w = (q.lb.z == 3) ? b.z : q.a.w;
+
+ return q;
+}
+
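+// InstanceIDAndMask packs the 24-bit instance ID into the low bits and the
+// 8-bit instance mask into the top byte; InstanceContributionToHitGroupIndexAndFlags
+// packs the 24-bit hit-group contribution and the 8-bit flags the same way.
+// The setters below clear only their own field and preserve the other one.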
+GRL_INLINE float InstanceDesc_get_transform(const InstanceDesc *d, const uint32_t row, const uint32_t column)
+{
+ return d->Transform[row][column];
+}
+
+GRL_INLINE uint32_t InstanceDesc_get_instanceID(const InstanceDesc *d)
+{
+ return d->InstanceIDAndMask & (0x00FFFFFF);
+}
+
+GRL_INLINE uint32_t InstanceDesc_get_InstanceMask(const InstanceDesc *d)
+{
+ return d->InstanceIDAndMask >> 24;
+}
+
+GRL_INLINE uint32_t InstanceDesc_get_InstanceContributionToHitGroupIndex(const InstanceDesc *d)
+{
+ return d->InstanceContributionToHitGroupIndexAndFlags & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t InstanceDesc_get_InstanceFlags(const InstanceDesc *d)
+{
+ return d->InstanceContributionToHitGroupIndexAndFlags >> 24;
+}
+
+GRL_INLINE gpuva_t InstanceDesc_get_AccelerationStructure(const InstanceDesc *d)
+{
+ return d->AccelerationStructureGPUVA;
+}
+
+GRL_INLINE void InstanceDesc_set_transform(InstanceDesc *d, const uint32_t row, const uint32_t column, float value)
+{
+ d->Transform[row][column] = value;
+}
+
+GRL_INLINE void InstanceDesc_set_instanceID(InstanceDesc *d, const uint32_t id)
+{
+ d->InstanceIDAndMask &= 255 << 24;
+ d->InstanceIDAndMask |= id & ((1 << 24) - 1);
+}
+
+GRL_INLINE void InstanceDesc_set_InstanceMask(InstanceDesc *d, const uint32_t mask)
+{
+ d->InstanceIDAndMask &= ((1 << 24) - 1);
+ d->InstanceIDAndMask |= mask << 24;
+}
+
+GRL_INLINE void InstanceDesc_set_InstanceContributionToHitGroupIndex(InstanceDesc *d, const uint32_t contribution)
+{
+ d->InstanceContributionToHitGroupIndexAndFlags &= 255 << 24;
+ d->InstanceContributionToHitGroupIndexAndFlags |= contribution & ((1 << 24) - 1);
+}
+
+GRL_INLINE void InstanceDesc_set_InstanceFlags(InstanceDesc *d, const uint32_t flags)
+{
+ d->InstanceContributionToHitGroupIndexAndFlags &= ((1 << 24) - 1);
+ d->InstanceContributionToHitGroupIndexAndFlags |= flags << 24;
+}
+
+GRL_INLINE void InstanceDesc_set_AccelerationStructure(InstanceDesc *d, gpuva_t address)
+{
+ d->AccelerationStructureGPUVA = address;
+}
diff --git a/src/intel/vulkan/grl/gpu/copy.grl b/src/intel/vulkan/grl/gpu/copy.grl
new file mode 100644
index 00000000000..1bb500a4ea0
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/copy.grl
@@ -0,0 +1,129 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module copy; // In copy we assume the output data structure is DXR-compatible
+
+kernel clone_indirect < source="bvh_copy.cl", kernelFunction="clone_indirect" >
+kernel compact < source="bvh_copy.cl", kernelFunction="compact" >
+kernel serialize_indirect < source="bvh_copy.cl", kernelFunction="serialize_indirect" >
+kernel serialize_for_input_dump_indirect < source="bvh_copy.cl", kernelFunction="serialize_for_input_dump_indirect" >
+kernel deserialize_indirect < source="bvh_copy.cl", kernelFunction="deserialize_indirect" >
+kernel dxr_decode < source="bvh_copy.cl", kernelFunction="dxr_decode" >
+
+metakernel clone_indirect(
+ qword dest,
+ qword src,
+ qword srcBVHsizedwordAddr)
+{
+// This has to be compatible with the in-kernel GroupCountForCopy(...)
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
+ define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
+ define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
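+ // One workgroup is dispatched per 256-byte chunk of the source BVH (shift by
+ // BYTE_PER_GROUP_CHUNK_SHIFT), plus REMINDER_NUM_GROUPS extra groups to cover
+ // the rounded-down remainder; e.g. byteSize = 1000 gives (1000 >> 8) + 4 = 7 groups.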
+ byteSize = load_dword(srcBVHsizedwordAddr);
+ numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
+ numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
+
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect clone_indirect args(
+ dest,
+ src);
+}
+
+metakernel compact(
+ qword dest,
+ qword src)
+{
+ dispatch compact(32,1,1) args(
+ dest,
+ src,
+ 32);
+}
+
+metakernel serialize_indirect(
+ qword dest,
+ qword src,
+ qword driverID,
+ qword srcBVHsizedwordAddr)
+{
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
+ define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
+ define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
+ byteSize = load_dword(srcBVHsizedwordAddr);
+ numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
+ numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect serialize_indirect args(
+ dest,
+ src,
+ driverID);
+}
+
+metakernel serialize_for_input_dump_indirect(
+ qword batchPtrs,
+ qword dstOffset,
+ qword src,
+ qword driverID,
+ qword srcBVHsizedwordAddr)
+{
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define BYTE_PER_GROUP_CHUNK_SHIFT REG2; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
+ define REMINDER_NUM_GROUPS REG3; REMINDER_NUM_GROUPS = 4;
+ byteSize = load_dword(srcBVHsizedwordAddr);
+ numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
+ numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect serialize_for_input_dump_indirect args(
+ batchPtrs,
+ dstOffset,
+ src,
+ driverID);
+}
+
+metakernel deserialize_indirect(
+ qword dest,
+ qword src,
+ qword srcBVHsizedwordAddr)
+{
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define BYTE_PER_GROUP_CHUNK_ROUNDUP REG2; BYTE_PER_GROUP_CHUNK_ROUNDUP = 255;
+ define BYTE_PER_GROUP_CHUNK_SHIFT REG3; BYTE_PER_GROUP_CHUNK_SHIFT = 8;
+ define REMINDER_NUM_GROUPS REG4; REMINDER_NUM_GROUPS = 4;
+ byteSize = load_dword(srcBVHsizedwordAddr);
+ numGroupsRqd = byteSize >> BYTE_PER_GROUP_CHUNK_SHIFT;
+ numGroupsRqd = numGroupsRqd + REMINDER_NUM_GROUPS;
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect deserialize_indirect args(
+ dest,
+ src);
+}
+
+metakernel dxr_decode(
+ qword dest,
+ qword src)
+{
+ dispatch dxr_decode(1,1,1) args(
+ dest,
+ src);
+}
diff --git a/src/intel/vulkan/grl/gpu/d3d12.h b/src/intel/vulkan/grl/gpu/d3d12.h
new file mode 100644
index 00000000000..32a7654eac5
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/d3d12.h
@@ -0,0 +1,525 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+#include "GRLStructs.h"
+#include "shared.h"
+
+typedef global void *D3D12_GPU_VIRTUAL_ADDRESS;
+typedef void *ID3D12StateObjectPrototype;
+
+enum DXGI_FORMAT
+{
+ DXGI_FORMAT_UNKNOWN,
+ DXGI_FORMAT_R32G32B32A32_TYPELESS,
+ DXGI_FORMAT_R32G32B32A32_FLOAT,
+ DXGI_FORMAT_R32G32B32A32_UINT,
+ DXGI_FORMAT_R32G32B32A32_SINT,
+ DXGI_FORMAT_R32G32B32_TYPELESS,
+ DXGI_FORMAT_R32G32B32_FLOAT,
+ DXGI_FORMAT_R32G32B32_UINT,
+ DXGI_FORMAT_R32G32B32_SINT,
+ DXGI_FORMAT_R16G16B16A16_TYPELESS,
+ DXGI_FORMAT_R16G16B16A16_FLOAT,
+ DXGI_FORMAT_R16G16B16A16_UNORM,
+ DXGI_FORMAT_R16G16B16A16_UINT,
+ DXGI_FORMAT_R16G16B16A16_SNORM,
+ DXGI_FORMAT_R16G16B16A16_SINT,
+ DXGI_FORMAT_R32G32_TYPELESS,
+ DXGI_FORMAT_R32G32_FLOAT,
+ DXGI_FORMAT_R32G32_UINT,
+ DXGI_FORMAT_R32G32_SINT,
+ DXGI_FORMAT_R32G8X24_TYPELESS,
+ DXGI_FORMAT_D32_FLOAT_S8X24_UINT,
+ DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS,
+ DXGI_FORMAT_X32_TYPELESS_G8X24_UINT,
+ DXGI_FORMAT_R10G10B10A2_TYPELESS,
+ DXGI_FORMAT_R10G10B10A2_UNORM,
+ DXGI_FORMAT_R10G10B10A2_UINT,
+ DXGI_FORMAT_R11G11B10_FLOAT,
+ DXGI_FORMAT_R8G8B8A8_TYPELESS,
+ DXGI_FORMAT_R8G8B8A8_UNORM,
+ DXGI_FORMAT_R8G8B8A8_UNORM_SRGB,
+ DXGI_FORMAT_R8G8B8A8_UINT,
+ DXGI_FORMAT_R8G8B8A8_SNORM,
+ DXGI_FORMAT_R8G8B8A8_SINT,
+ DXGI_FORMAT_R16G16_TYPELESS,
+ DXGI_FORMAT_R16G16_FLOAT,
+ DXGI_FORMAT_R16G16_UNORM,
+ DXGI_FORMAT_R16G16_UINT,
+ DXGI_FORMAT_R16G16_SNORM,
+ DXGI_FORMAT_R16G16_SINT,
+ DXGI_FORMAT_R32_TYPELESS,
+ DXGI_FORMAT_D32_FLOAT,
+ DXGI_FORMAT_R32_FLOAT,
+ DXGI_FORMAT_R32_UINT,
+ DXGI_FORMAT_R32_SINT,
+ DXGI_FORMAT_R24G8_TYPELESS,
+ DXGI_FORMAT_D24_UNORM_S8_UINT,
+ DXGI_FORMAT_R24_UNORM_X8_TYPELESS,
+ DXGI_FORMAT_X24_TYPELESS_G8_UINT,
+ DXGI_FORMAT_R8G8_TYPELESS,
+ DXGI_FORMAT_R8G8_UNORM,
+ DXGI_FORMAT_R8G8_UINT,
+ DXGI_FORMAT_R8G8_SNORM,
+ DXGI_FORMAT_R8G8_SINT,
+ DXGI_FORMAT_R16_TYPELESS,
+ DXGI_FORMAT_R16_FLOAT,
+ DXGI_FORMAT_D16_UNORM,
+ DXGI_FORMAT_R16_UNORM,
+ DXGI_FORMAT_R16_UINT,
+ DXGI_FORMAT_R16_SNORM,
+ DXGI_FORMAT_R16_SINT,
+ DXGI_FORMAT_R8_TYPELESS,
+ DXGI_FORMAT_R8_UNORM,
+ DXGI_FORMAT_R8_UINT,
+ DXGI_FORMAT_R8_SNORM,
+ DXGI_FORMAT_R8_SINT,
+ DXGI_FORMAT_A8_UNORM,
+ DXGI_FORMAT_R1_UNORM,
+ DXGI_FORMAT_R9G9B9E5_SHAREDEXP,
+ DXGI_FORMAT_R8G8_B8G8_UNORM,
+ DXGI_FORMAT_G8R8_G8B8_UNORM,
+ DXGI_FORMAT_BC1_TYPELESS,
+ DXGI_FORMAT_BC1_UNORM,
+ DXGI_FORMAT_BC1_UNORM_SRGB,
+ DXGI_FORMAT_BC2_TYPELESS,
+ DXGI_FORMAT_BC2_UNORM,
+ DXGI_FORMAT_BC2_UNORM_SRGB,
+ DXGI_FORMAT_BC3_TYPELESS,
+ DXGI_FORMAT_BC3_UNORM,
+ DXGI_FORMAT_BC3_UNORM_SRGB,
+ DXGI_FORMAT_BC4_TYPELESS,
+ DXGI_FORMAT_BC4_UNORM,
+ DXGI_FORMAT_BC4_SNORM,
+ DXGI_FORMAT_BC5_TYPELESS,
+ DXGI_FORMAT_BC5_UNORM,
+ DXGI_FORMAT_BC5_SNORM,
+ DXGI_FORMAT_B5G6R5_UNORM,
+ DXGI_FORMAT_B5G5R5A1_UNORM,
+ DXGI_FORMAT_B8G8R8A8_UNORM,
+ DXGI_FORMAT_B8G8R8X8_UNORM,
+ DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM,
+ DXGI_FORMAT_B8G8R8A8_TYPELESS,
+ DXGI_FORMAT_B8G8R8A8_UNORM_SRGB,
+ DXGI_FORMAT_B8G8R8X8_TYPELESS,
+ DXGI_FORMAT_B8G8R8X8_UNORM_SRGB,
+ DXGI_FORMAT_BC6H_TYPELESS,
+ DXGI_FORMAT_BC6H_UF16,
+ DXGI_FORMAT_BC6H_SF16,
+ DXGI_FORMAT_BC7_TYPELESS,
+ DXGI_FORMAT_BC7_UNORM,
+ DXGI_FORMAT_BC7_UNORM_SRGB,
+ DXGI_FORMAT_AYUV,
+ DXGI_FORMAT_Y410,
+ DXGI_FORMAT_Y416,
+ DXGI_FORMAT_NV12,
+ DXGI_FORMAT_P010,
+ DXGI_FORMAT_P016,
+ DXGI_FORMAT_420_OPAQUE,
+ DXGI_FORMAT_YUY2,
+ DXGI_FORMAT_Y210,
+ DXGI_FORMAT_Y216,
+ DXGI_FORMAT_NV11,
+ DXGI_FORMAT_AI44,
+ DXGI_FORMAT_IA44,
+ DXGI_FORMAT_P8,
+ DXGI_FORMAT_A8P8,
+ DXGI_FORMAT_B4G4R4A4_UNORM,
+ DXGI_FORMAT_P208,
+ DXGI_FORMAT_V208,
+ DXGI_FORMAT_V408,
+ DXGI_FORMAT_FORCE_UINT
+};
+
+typedef enum D3D12_RAYTRACING_GEOMETRY_FLAGS
+{
+ D3D12_RAYTRACING_GEOMETRY_FLAG_NONE = 0,
+ D3D12_RAYTRACING_GEOMETRY_FLAG_OPAQUE = 0x1,
+ D3D12_RAYTRACING_GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2
+} D3D12_RAYTRACING_GEOMETRY_FLAGS;
+
+typedef enum D3D12_RAYTRACING_GEOMETRY_TYPE
+{
+ D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES = 0,
+ D3D12_RAYTRACING_GEOMETRY_TYPE_PROCEDURAL_PRIMITIVE_AABBS = (D3D12_RAYTRACING_GEOMETRY_TYPE_TRIANGLES + 1)
+} D3D12_RAYTRACING_GEOMETRY_TYPE;
+
+typedef enum D3D12_RAYTRACING_INSTANCE_FLAGS
+{
+ D3D12_RAYTRACING_INSTANCE_FLAG_NONE = 0,
+ D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1,
+ D3D12_RAYTRACING_INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2,
+ D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_OPAQUE = 0x4,
+ D3D12_RAYTRACING_INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8
+} D3D12_RAYTRACING_INSTANCE_FLAGS;
+
+typedef struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE
+{
+ D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
+ unsigned long StrideInBytes;
+} D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE;
+
+typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE
+{
+ D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
+ unsigned long SizeInBytes;
+} D3D12_GPU_VIRTUAL_ADDRESSRANGE;
+
+typedef struct D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE
+{
+ D3D12_GPU_VIRTUAL_ADDRESS StartAddress;
+ unsigned long SizeInBytes;
+ unsigned long StrideInBytes;
+} D3D12_GPU_VIRTUAL_ADDRESSRANGE_AND_STRIDE;
+
+typedef struct D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC
+{
+ D3D12_GPU_VIRTUAL_ADDRESS Transform;
+ enum DXGI_FORMAT IndexFormat;
+ enum DXGI_FORMAT VertexFormat;
+ unsigned int IndexCount;
+ unsigned int VertexCount;
+ D3D12_GPU_VIRTUAL_ADDRESS IndexBuffer;
+ struct D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE VertexBuffer;
+} D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC;
+
+typedef struct D3D12_RAYTRACING_AABB
+{
+ float MinX;
+ float MinY;
+ float MinZ;
+ float MaxX;
+ float MaxY;
+ float MaxZ;
+} D3D12_RAYTRACING_AABB;
+
+GRL_INLINE void D3D12_set_raytracing_aabb(D3D12_RAYTRACING_AABB* dest, struct AABB* source)
+{
+ dest->MinX = source->lower.x;
+ dest->MinY = source->lower.y;
+ dest->MinZ = source->lower.z;
+ dest->MaxX = source->upper.x;
+ dest->MaxY = source->upper.y;
+ dest->MaxZ = source->upper.z;
+}
+
+typedef struct D3D12_RAYTRACING_GEOMETRY_AABBS_DESC
+{
+ unsigned long AABBCount;
+ D3D12_GPU_VIRTUAL_ADDRESSAND_STRIDE AABBs;
+} D3D12_RAYTRACING_GEOMETRY_AABBS_DESC;
+
+typedef struct D3D12_RAYTRACING_GEOMETRY_DESC
+{
+ D3D12_RAYTRACING_GEOMETRY_TYPE Type;
+ D3D12_RAYTRACING_GEOMETRY_FLAGS Flags;
+ //unsigned int ShaderIndex : 24; // extension
+ //unsigned int Mask : 8; // extension
+ //unsigned int ShaderIndex_Mask; // extension
+ union {
+ D3D12_RAYTRACING_GEOMETRY_TRIANGLES_DESC Triangles;
+ D3D12_RAYTRACING_GEOMETRY_AABBS_DESC AABBs;
+ };
+} D3D12_RAYTRACING_GEOMETRY_DESC;
+
+GRL_INLINE void D3D12_set_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_TYPE type)
+{
+ geomDesc->Type = type;
+}
+
+GRL_INLINE D3D12_RAYTRACING_GEOMETRY_TYPE D3D12_get_Type(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Type;
+}
+
+GRL_INLINE void D3D12_set_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_RAYTRACING_GEOMETRY_FLAGS flags)
+{
+ geomDesc->Flags = flags;
+}
+
+GRL_INLINE D3D12_RAYTRACING_GEOMETRY_FLAGS D3D12_get_Flags(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Flags;
+}
+
+GRL_INLINE void D3D12_set_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS transform)
+{
+ geomDesc->Triangles.Transform = transform;
+}
+
+GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_Transform(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.Transform;
+}
+
+GRL_INLINE void D3D12_set_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, IndexFormat format)
+{
+ switch (format)
+ {
+ case INDEX_FORMAT_NONE:
+ geomDesc->Triangles.IndexFormat = DXGI_FORMAT_UNKNOWN;
+ break;
+ case INDEX_FORMAT_R16_UINT:
+ geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R16_UINT;
+ break;
+ case INDEX_FORMAT_R32_UINT:
+ geomDesc->Triangles.IndexFormat = DXGI_FORMAT_R32_UINT;
+ break;
+ }
+}
+
+GRL_INLINE IndexFormat D3D12_get_triangles_IndexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ switch (geomDesc->Triangles.IndexFormat)
+ {
+ case DXGI_FORMAT_R16_UINT:
+ return INDEX_FORMAT_R16_UINT;
+ case DXGI_FORMAT_R32_UINT:
+ return INDEX_FORMAT_R32_UINT;
+ case DXGI_FORMAT_UNKNOWN:
+ default:
+ return INDEX_FORMAT_NONE;
+ }
+}
+
+GRL_INLINE void D3D12_set_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, VertexFormat format)
+{
+ switch (format)
+ {
+ case VERTEX_FORMAT_R32G32_FLOAT:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32_FLOAT;
+ break;
+ case VERTEX_FORMAT_R32G32B32_FLOAT:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R32G32B32_FLOAT;
+ break;
+ case VERTEX_FORMAT_R16G16_FLOAT:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_FLOAT;
+ break;
+ case VERTEX_FORMAT_R16G16B16A16_FLOAT:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_FLOAT;
+ break;
+ case VERTEX_FORMAT_R16G16_SNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_SNORM;
+ break;
+ case VERTEX_FORMAT_R16G16B16A16_SNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_SNORM;
+ break;
+ case VERTEX_FORMAT_R16G16B16A16_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16B16A16_UNORM;
+ break;
+ case VERTEX_FORMAT_R16G16_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R16G16_UNORM;
+ break;
+ case VERTEX_FORMAT_R10G10B10A2_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R10G10B10A2_UNORM;
+ break;
+ case VERTEX_FORMAT_R8G8B8A8_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_UNORM;
+ break;
+ case VERTEX_FORMAT_R8G8_UNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_UNORM;
+ break;
+ case VERTEX_FORMAT_R8G8B8A8_SNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8B8A8_SNORM;
+ break;
+ case VERTEX_FORMAT_R8G8_SNORM:
+ geomDesc->Triangles.VertexFormat = DXGI_FORMAT_R8G8_SNORM;
+ break;
+ }
+}
+
+GRL_INLINE VertexFormat D3D12_get_triangles_VertexFormat(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ switch(geomDesc->Triangles.VertexFormat)
+ {
+ case DXGI_FORMAT_R32G32_FLOAT:
+ return VERTEX_FORMAT_R32G32_FLOAT;
+ case DXGI_FORMAT_R32G32B32_FLOAT:
+ return VERTEX_FORMAT_R32G32B32_FLOAT;
+ case DXGI_FORMAT_R16G16_FLOAT:
+ return VERTEX_FORMAT_R16G16_FLOAT;
+ case DXGI_FORMAT_R16G16B16A16_FLOAT:
+ return VERTEX_FORMAT_R16G16B16A16_FLOAT;
+ case DXGI_FORMAT_R16G16_SNORM:
+ return VERTEX_FORMAT_R16G16_SNORM;
+ case DXGI_FORMAT_R16G16B16A16_SNORM:
+ return VERTEX_FORMAT_R16G16B16A16_SNORM;
+ case DXGI_FORMAT_R16G16B16A16_UNORM:
+ return VERTEX_FORMAT_R16G16B16A16_UNORM;
+ case DXGI_FORMAT_R16G16_UNORM:
+ return VERTEX_FORMAT_R16G16_UNORM;
+ case DXGI_FORMAT_R10G10B10A2_UNORM:
+ return VERTEX_FORMAT_R10G10B10A2_UNORM;
+ case DXGI_FORMAT_R8G8B8A8_UNORM:
+ return VERTEX_FORMAT_R8G8B8A8_UNORM;
+ case DXGI_FORMAT_R8G8_UNORM:
+ return VERTEX_FORMAT_R8G8_UNORM;
+ case DXGI_FORMAT_R8G8B8A8_SNORM:
+ return VERTEX_FORMAT_R8G8B8A8_SNORM;
+ case DXGI_FORMAT_R8G8_SNORM:
+ return VERTEX_FORMAT_R8G8_SNORM;
+ default:
+ return VERTEX_FORMAT_R32G32_FLOAT;
+ }
+}
+
+GRL_INLINE void D3D12_set_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count)
+{
+ geomDesc->Triangles.IndexCount = count;
+}
+
+GRL_INLINE unsigned int D3D12_get_triangles_IndexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.IndexCount;
+}
+
+GRL_INLINE void D3D12_set_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned int count)
+{
+ geomDesc->Triangles.VertexCount = count;
+}
+
+GRL_INLINE unsigned int D3D12_get_triangles_VertexCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.VertexCount;
+}
+
+GRL_INLINE void D3D12_set_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS buffer)
+{
+ geomDesc->Triangles.IndexBuffer = buffer;
+}
+
+GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_IndexBuffer(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.IndexBuffer;
+}
+
+GRL_INLINE void D3D12_set_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+ geomDesc->Triangles.VertexBuffer.StartAddress = address;
+}
+
+GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_triangles_VertexBuffer_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.VertexBuffer.StartAddress;
+}
+
+GRL_INLINE void D3D12_set_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride)
+{
+ geomDesc->Triangles.VertexBuffer.StrideInBytes = stride;
+}
+
+GRL_INLINE unsigned long D3D12_get_triangles_VertexBuffer_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->Triangles.VertexBuffer.StrideInBytes;
+}
+
+GRL_INLINE void D3D12_set_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long count)
+{
+ geomDesc->AABBs.AABBCount = count;
+}
+
+GRL_INLINE unsigned long D3D12_get_procedurals_AABBCount(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->AABBs.AABBCount;
+}
+
+GRL_INLINE void D3D12_set_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, D3D12_GPU_VIRTUAL_ADDRESS address)
+{
+ geomDesc->AABBs.AABBs.StartAddress = address;
+}
+
+GRL_INLINE D3D12_GPU_VIRTUAL_ADDRESS D3D12_get_procedurals_AABBs_StartAddress(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->AABBs.AABBs.StartAddress;
+}
+
+GRL_INLINE void D3D12_set_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc, unsigned long stride)
+{
+ geomDesc->AABBs.AABBs.StrideInBytes = stride;
+}
+
+GRL_INLINE unsigned long D3D12_get_procedurals_AABBs_StrideInBytes(D3D12_RAYTRACING_GEOMETRY_DESC *geomDesc)
+{
+ return geomDesc->AABBs.AABBs.StrideInBytes;
+}
+
+typedef struct D3D12_RAYTRACING_INSTANCE_DESC
+{
+ float Transform[12];
+ // unsigned int InstanceID : 24;
+ // unsigned int InstanceMask : 8;
+ uint32_t DW0;
+ // unsigned int InstanceContributionToHitGroupIndex : 24;
+ // unsigned int Flags : 8;
+ uint32_t DW1;
+ global char *AccelerationStructure;
+} D3D12_RAYTRACING_INSTANCE_DESC;
+
+GRL_INLINE float D3D12_get_transform(const D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column)
+{
+ return d->Transform[row * 4 + column];
+}
+
+GRL_INLINE uint32_t D3D12_get_instanceID(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return d->DW0 & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t D3D12_get_InstanceMask(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return d->DW0 >> 24;
+}
+
+GRL_INLINE uint32_t D3D12_get_InstanceContributionToHitGroupIndex(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return d->DW1 & ((1 << 24) - 1);
+}
+
+GRL_INLINE uint32_t D3D12_get_InstanceFlags(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return d->DW1 >> 24;
+}
+
+GRL_INLINE gpuva_t D3D12_get_AccelerationStructure(const D3D12_RAYTRACING_INSTANCE_DESC *d)
+{
+ return (gpuva_t)d->AccelerationStructure;
+}
+
+GRL_INLINE void D3D12_set_transform(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t row, const uint32_t column, float value)
+{
+ d->Transform[row * 4 + column] = value;
+}
+
+GRL_INLINE void D3D12_set_instanceID(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t id)
+{
+ d->DW0 &= 255 << 24;
+ d->DW0 |= id & ((1 << 24) - 1);
+}
+
+GRL_INLINE void D3D12_set_InstanceMask(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t mask)
+{
+ d->DW0 &= ((1 << 24) - 1);
+ d->DW0 |= mask << 24;
+}
+
+GRL_INLINE void D3D12_set_InstanceContributionToHitGroupIndex(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t contribution)
+{
+ d->DW1 &= 255 << 24;
+ d->DW1 |= contribution & ((1 << 24) - 1);
+}
+
+GRL_INLINE void D3D12_set_InstanceFlags(D3D12_RAYTRACING_INSTANCE_DESC *d, const uint32_t flags)
+{
+ d->DW1 &= ((1 << 24) - 1);
+ d->DW1 |= flags << 24;
+}
+
+GRL_INLINE void D3D12_set_AccelerationStructure(D3D12_RAYTRACING_INSTANCE_DESC *d, gpuva_t address)
+{
+ d->AccelerationStructure = (global char*)address;
+}
diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl
new file mode 100644
index 00000000000..d37adbbbb2b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.cl
@@ -0,0 +1,59 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel CopyGeom(
+ global struct Geo *src,
+ global struct Geo *dst,
+ global float4 *vec,
+ global ushort *indices,
+ dword step)
+{
+ src = src + get_group_id(0);
+ dst = dst + get_group_id(0);
+ dst->Flags = src->Flags;
+ dst->Type = src->Type;
+ if (src->Type == GEOMETRY_TYPE_PROCEDURAL)
+ {
+ dst->Desc.Procedural.AABBByteStride = src->Desc.Procedural.AABBByteStride;
+ dst->Desc.Procedural.AABBCount = src->Desc.Procedural.AABBCount;
+ }
+ else
+ {
+ dst->Desc.Triangles.pTransformBuffer = src->Desc.Triangles.pTransformBuffer;
+ if (step == 0)
+ return;
+ dst->Desc.Triangles.IndexCount = src->Desc.Triangles.IndexCount;
+ if (step == 1)
+ return;
+ dst->Desc.Triangles.VertexCount = src->Desc.Triangles.VertexCount;
+ if (step == 2)
+ return;
+ dst->Desc.Triangles.IndexFormat = src->Desc.Triangles.IndexFormat;
+ if (step == 3)
+ return;
+ dst->Desc.Triangles.pIndexBuffer = src->Desc.Triangles.pIndexBuffer;
+ if (step == 4)
+ return;
+ dst->Desc.Triangles.pVertexBuffer = src->Desc.Triangles.pVertexBuffer;
+ if (step == 5)
+ return;
+ dst->Desc.Triangles.VertexBufferByteStride = src->Desc.Triangles.VertexBufferByteStride;
+
+ dst->Desc.Triangles.VertexFormat = src->Desc.Triangles.VertexFormat;
+
+ for (uint t = 0; t * 3 < dst->Desc.Triangles.IndexCount; t++)
+ {
+ uint3 tri = GRL_load_triangle(src, t);
+ vec[t * 3] = GRL_load_vertex(src, tri[0]);
+ vec[t * 3 + 1] = GRL_load_vertex(src, tri[1]);
+ vec[t * 3 + 2] = GRL_load_vertex(src, tri[2]);
+ }
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl
new file mode 100644
index 00000000000..3779439c54b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/grl_api_interface_verify.grl
@@ -0,0 +1,27 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module api_interface_verify;
+
+kernel copy_geom < source="grl_api_interface_verify.cl", kernelFunction="CopyGeom" >
+
+metakernel ifc0_copy(
+ qword src,
+ qword dst,
+ qword vec,
+ qword srcIndices,
+ dword numGroups,
+ dword step)
+{
+ dispatch copy_geom(numGroups,1,1) args(
+ src,
+ dst,
+ vec,
+ srcIndices,
+ step
+ );
+}
diff --git a/src/intel/vulkan/grl/gpu/input_dump.cl b/src/intel/vulkan/grl/gpu/input_dump.cl
new file mode 100644
index 00000000000..f668f053f1f
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/input_dump.cl
@@ -0,0 +1,723 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+#include "d3d12.h"
+#include "mem_utils.h"
+#include "misc_shared.h"
+
+/// Align value to 128
+///
+/// @param value vale to align
+/// @return aligned value
+GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; }
+
+GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) {
+ return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch)));
+}
+
+/// Finds max used byte in vertex buffer
+///
+/// @param indexBuffPtr pointer to index buffer
+/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers
+/// @param IndexCount number of indices in index buffer
+/// @param IndexFormat index format
+/// @param VertexCount number of vertices in vertex buffer
+/// @param VertexBufferByteStride vertex buffer byte stride
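+///
+/// Strategy: for indexed geometry each invocation reads one index, the maximum
+/// index is reduced within each sub-group, combined across sub-groups through
+/// the sgMax[] scratch, converted to a byte count via the vertex stride, and
+/// atomically max-ed into vertexBufferUsedByteEnd. Non-indexed geometry simply
+/// contributes VertexCount * VertexBufferByteStride.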
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel find_max_used_byte_in_buff(
+ global void* indexBuffPtr,
+ global uint* vertexBufferUsedByteEnd,
+ dword IndexCount,
+ dword IndexFormat,
+ dword VertexCount,
+ qword VertexBufferByteStride)
+{
+ local uint sgMax[16];
+ uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0);
+
+ if (IndexFormat != INDEX_FORMAT_NONE)
+ {
+ uint endByte = 0;
+ if (glob_id < IndexCount)
+ {
+ if (IndexFormat == INDEX_FORMAT_R16_UINT)
+ {
+ global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr;
+ endByte = indexBuffPtrShort[glob_id];
+ }
+ else
+ {
+ global uint* indexBuffPtrUint = (global uint*) indexBuffPtr;
+ endByte = indexBuffPtrUint[glob_id];
+ }
+ }
+
+ endByte = sub_group_reduce_max(endByte);
+
+ if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (get_sub_group_id() == 0)
+ {
+ endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]);
+ if (get_sub_group_local_id() == 0)
+ {
+ endByte = min(endByte, VertexCount);
+ if (endByte < VertexCount && IndexCount != 0)
+ ++endByte;
+ endByte *= (dword)VertexBufferByteStride;
+ atomic_max(vertexBufferUsedByteEnd, endByte);
+ }
+ }
+ }
+ else if (glob_id == 0)
+ {
+ uint endByte = VertexCount * VertexBufferByteStride;
+ atomic_max(vertexBufferUsedByteEnd, endByte);
+ }
+}
+
+/// Allocates buffer for vertices
+///
+/// @param batchPtrs batch pointers struct
+/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers
+/// @param vertexBufferOffset pointer to offsets to vertex buffers
+/// @param numVertexBuffers number of vertex buffers
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel allocate_linear_offsets_for_vertex_buffers(
+ global InputBatchPtrs* batchPtrs,
+ global uint* vertexBufferUsedByteEnd,
+ global uint* vertexBufferOffset,
+ dword numVertexBuffers)
+{
+ uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();
+
+ if (glob_id < numVertexBuffers)
+ {
+ uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]);
+ uint position = atomic_add_global( &batchPtrs->vertexBuffersSize, numBytes);
+ vertexBufferOffset[glob_id] = position;
+ }
+}
+
+/// Sets the dst data space for input dump of this batch
+///
+/// @param inputDumpMainBuffer pointer to main dump buffer
+/// @param batchPtrs batch pointers struct
+/// @param nonVertexSize size of non vertex data
+/// @param batchIdPtr pointer to batch id
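+///
+/// The dump buffer is a ring shared with a CPU reader: the batch reserves
+/// [gpuHead, gpuHead + size) by CAS-ing gpuHead forward, spinning until the
+/// reserved range no longer overlaps the unconsumed region tracked by tail.
+/// If the reservation would run past totalSize, the head wraps back to
+/// headStart and an INPUT_DUMP_OP_END_BUFFER marker is written at the old
+/// head so the reader knows to wrap as well.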
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel allocate_data_space_for_inputs(
+ global DebugBufferHeader* inputDumpMainBuffer,
+ global InputBatchPtrs* batchPtrs,
+ uint nonVertexSize,
+ global qword* batchIdPtr)
+{
+ if (get_sub_group_local_id() == 0)
+ {
+ uint vertexBufferSize = batchPtrs->vertexBuffersSize;
+ uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize;
+
+ if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2))
+ {
+ inputDumpMainBuffer->overflow = 1;
+ batchPtrs->dumpDst = 0;
+ batchPtrs->globalDumpBuffer = 0;
+ batchPtrs->nonVertexDataStart = 0;
+ batchPtrs->totalSize = 0;
+ return;
+ }
+
+ dword prevHead = inputDumpMainBuffer->gpuHead;
+ dword newHead;
+ bool circled;
+
+ do
+ {
+ circled = false;
+ newHead = prevHead + sizeOfThisBatch;
+ dword bufferBegin = prevHead;
+ if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize)
+ {
+ circled = true;
+ newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch;
+ bufferBegin = inputDumpMainBuffer->headStart;
+ }
+ dword bufferEnd = newHead + sizeof(InputBatch);
+
+ uint tail;
+ uint tail2 = 7;
+ bool wait;
+ do
+ {
+ wait = true;
+ tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0);
+
+ // dead code, workaround so IGC won't move tail load out of loop
+ if (tail > inputDumpMainBuffer->totalSize)
+ {
+ store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2);
+ tail2 = tail;
+ }
+
+ if( prevHead >= tail )
+ {
+ // collision example:
+ // ----------T=======H------------
+ // -------B=====E-----------------
+ //
+ if((bufferEnd < tail) || (bufferBegin >= prevHead))
+ {
+ wait = false;
+ }
+ }
+ else
+ {
+ // collision example:
+ // ==========H-------T============
+ // B==============E---------------
+ // caution: H can never wrap all the way around and catch up so that H == T
+ if((bufferEnd < tail) && (bufferBegin >= prevHead))
+ {
+ wait = false;
+ }
+ }
+ } while (wait);
+ } while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead));
+
+ if (circled)
+ {
+ global InputBatch* endBufferOp = (global InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead);
+ endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER;
+ prevHead = inputDumpMainBuffer->headStart;
+ }
+
+ global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead;
+ batchPtrs->dumpDst = (qword)thisBatchDump;
+ batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer;
+ batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize);
+ batchPtrs->totalSize = sizeOfThisBatch;
+
+ global InputBatch* batchOp = (global InputBatch*) thisBatchDump;
+ batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH;
+ batchOp->header.opHeader.endOfData = sizeOfThisBatch;
+ batchOp->vertexBufferDataSize = vertexBufferSize;
+ batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize;
+ batchOp->batchId = *batchIdPtr;
+ }
+}
+
+/// Sets the dst data space for output dump of this batch
+///
+/// @param outputDumpMainBuffer pointer to main dump buffer
+/// @param batchPtrs batch pointers struct
+/// @param batchIdPtr pointer to batch id
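+///
+/// Uses the same ring-buffer reservation scheme as allocate_data_space_for_inputs,
+/// here for the output (post-build) dump.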
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel allocate_data_space_for_outputs(
+ global DebugBufferHeader* outputDumpMainBuffer,
+ global OutputBatchPtrs* batchPtrs,
+ global qword* batchIdPtr)
+{
+ if (get_sub_group_local_id() == 0)
+ {
+ uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize;
+
+ if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2))
+ {
+ outputDumpMainBuffer->overflow = 1;
+ batchPtrs->dumpDst = 0;
+ batchPtrs->dataStart = 0;
+ batchPtrs->totalSize = 0;
+ return;
+ }
+
+ dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
+ dword newHead;
+ bool circled;
+
+ do
+ {
+ //mem_fence_gpu_invalidate();
+ //prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
+ circled = false;
+ newHead = prevHead + sizeOfThisBatch;
+ dword bufferBegin = prevHead;
+ if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize)
+ {
+ circled = true;
+ newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch;
+ bufferBegin = outputDumpMainBuffer->headStart;
+ }
+ dword bufferEnd = newHead + sizeof(OutputBatch);
+
+ uint tail;
+ uint tail2 = 7;
+ bool wait;
+ do
+ {
+ wait = true;
+ tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0);
+
+ // dead code, workaround so IGC won't move tail load out of loop
+ if (tail > outputDumpMainBuffer->totalSize)
+ {
+ store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2);
+ tail2 = tail;
+ }
+
+ if( prevHead >= tail )
+ {
+ // collision example:
+ // ----------T=======H------------
+ // -------B=====E-----------------
+ //
+ if((bufferEnd < tail) || (bufferBegin >= prevHead))
+ {
+ wait = false;
+ }
+ }
+ else
+ {
+ // collision example:
+ // ==========H-------T============
+ // B==============E---------------
+ // caution: H can never wrap all the way around and catch up so that H == T
+ if((bufferEnd < tail) && (bufferBegin >= prevHead))
+ {
+ wait = false;
+ }
+ }
+ } while (wait);
+ } while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead));
+
+ if (circled)
+ {
+ global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead);
+ endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER;
+ prevHead = outputDumpMainBuffer->headStart;
+ }
+
+ global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead;
+ batchPtrs->dumpDst = (qword)thisBatchDump;
+ batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch)));
+ batchPtrs->totalSize = sizeOfThisBatch;
+
+ global OutputBatch* batchOp = (global OutputBatch*) thisBatchDump;
+ batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH;
+ batchOp->header.opHeader.endOfData = sizeOfThisBatch;
+ batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch));
+ batchOp->batchId = *batchIdPtr;
+ }
+}
+
+/// Calculates sum of output sizes
+///
+/// @param pbi pointer to post build infos
+/// @param destOffsets per-output offsets in dest buffer
+/// @param numOutputs number of outputs
+/// @param batchPtrs batch pointers struct
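+///
+/// A single sub-group walks the outputs in chunks of MAX_HW_SIMD_WIDTH: an
+/// exclusive scan yields each output's 128-byte-aligned offset (payload plus
+/// an OutputData header), a reduction advances the running total, and the
+/// grand total is written to batchPtrs->dataSize.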
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel calc_outputs_data_size(
+ global PostbuildInfoSerializationDesc* pbi,
+ global dword* destOffsets,
+ qword numOutputs,
+ global OutputBatchPtrs* batchPtrs)
+{
+ uint offset = 0;
+ for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH)
+ {
+ uint size = 0;
+ if (i < numOutputs)
+ {
+ size = AlignTo128(pbi[i].SerializedSizeInBytes);
+ size += AlignTo128(sizeof(OutputData));
+ destOffsets[i] = offset + sub_group_scan_exclusive_add(size);
+ }
+ offset += sub_group_reduce_add(size);
+ }
+ if (get_sub_group_local_id() == 0)
+ batchPtrs->dataSize = offset;
+}
+
+/// Adds output data operation to batch
+///
+/// @param batchPtrs batch pointers struct
+/// @param destOffset pointer to offset in dest buffer
+/// @param src pointer to source bvh
+/// @param pbi pointer to post build info
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel write_output_data_op(
+ global OutputBatchPtrs* batchPtrs,
+ global dword* destOffset,
+ qword src,
+ global PostbuildInfoSerializationDesc* pbi)
+{
+ if (batchPtrs->dataStart == 0)
+ return;
+
+ global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset);
+ out->header.operationType = OUTPUT_DUMP_OP_DATA;
+ out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes);
+ out->srcBvhPtr = src;
+}
+
+/// Writes index and transform data (triangles) or AABB data (procedurals)
+///
+/// @param batchPtrs batch pointers struct
+/// @param srcDesc description of source geometry
+/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer
+/// @param pVertexBufferSize pointer to used size of the vertex buffer
+/// @param dstDescOffset offset to dest geo desc
+/// @param dstDataOffset offset to dest geo data
+/// @param numThreads number of threads
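+///
+/// Dumped layout per geo: for triangle geos an optional 3x4 transform matrix
+/// comes first, followed by the 128-byte-aligned index data (the code uses the
+/// IndexFormat value directly as the per-index byte size); procedural geos
+/// store their AABB array instead. Buffer pointers in the stored descriptor
+/// are rewritten as offsets relative to the global dump buffer, with vertex
+/// data pointing into the shared linearised vertex-buffer area.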
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel write_geo_data(
+ global InputBatchPtrs* batchPtrs,
+ global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc,
+ global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
+ global uint* pVertexBufferSize,
+ qword dstDescOffset,
+ qword dstDataOffset,
+ dword numThreads)
+{
+ if (batchPtrs->dumpDst == 0) return;
+
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+
+ GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc;
+
+ global char* dstDataPtr = (global char*)(
+ batchPtrs->nonVertexDataStart + dstDataOffset);
+
+ global char* srcDataPtr;
+ global char* dstTransform;
+ uint bytesToCopy = 0;
+
+ if (geoDescToStore.Type == GEOMETRY_TYPE_TRIANGLES)
+ {
+ uint sizeOfMatrix = 0;
+
+ if (geoDescToStore.Desc.Triangles.pTransformBuffer)
+ {
+ sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float));
+ if (glob_id < 12)
+ {
+ global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer;
+ global float* matrixDst = (global float*)dstDataPtr;
+ matrixDst[glob_id] = matrixSrc[glob_id];
+ if (glob_id == 0)
+ {
+ geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer;
+ }
+ }
+ }
+
+ dstDataPtr += sizeOfMatrix;
+ srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer;
+
+ bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount);
+
+ if (bytesToCopy && (glob_id == 0))
+ {
+ qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
+ // for this we remember offset relative to global debug buffer
+ geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
+ geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
+ geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride;
+ }
+ else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0)
+ {
+ if (geoDescToStore.Desc.Triangles.pVertexBuffer)
+ {
+ qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
+ // for this we remember offset relative to global debug buffer
+ geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
+ }
+ }
+ else if (glob_id == 0)
+ {
+ geoDescToStore.Desc.Triangles.IndexCount = 0;
+ geoDescToStore.Desc.Triangles.VertexCount = 0;
+ geoDescToStore.Desc.Triangles.pVertexBuffer = 0;
+ geoDescToStore.Desc.Triangles.pIndexBuffer = 0;
+ }
+ }
+ else
+ {
+ srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA;
+ bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount);
+ if (glob_id == 0)
+ {
+ geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
+ }
+ }
+
+ if (bytesToCopy)
+ {
+ CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads);
+ }
+
+ if (glob_id == 0)
+ {
+ global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)(
+ batchPtrs->nonVertexDataStart + dstDescOffset);
+ *dstDescPtr = geoDescToStore;
+ }
+}
+
+/// Adds build operation to batch
+///
+/// @param batchPtrs batch pointers struct
+/// @param buildOpOffset offset in dst buffer
+/// @param srcBvh address of src bvh (in case of update)
+/// @param dstBvhAddr address of dest bvh buffer
+/// @param offsetToEnd offset to end of this operation
+/// @param flags build flags
+/// @param numGeometries number of geometries in build
+/// @param numInstances number of instances in build
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel write_input_build_op(
+ global InputBatchPtrs* batchPtrs,
+ qword buildOpOffset,
+ qword srcBvh,
+ qword dstBvhAddr,
+ dword offsetToEnd,
+ dword flags,
+ dword numGeometries,
+ dword numInstances,
+ dword instArrayOfPtrs)
+{
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+ if (batchPtrs->dumpDst == 0 || glob_id != 0) return;
+
+ global InputBuild* buildOp = (global InputBuild*)(
+ batchPtrs->nonVertexDataStart + buildOpOffset);
+ buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD;
+ buildOp->header.endOfData = offsetToEnd;
+ buildOp->dstBvhPtr = dstBvhAddr;
+ buildOp->srcBvhPtr = srcBvh;
+ buildOp->flags = flags;
+ buildOp->numGeos = numGeometries;
+ buildOp->numInstances = numInstances;
+ buildOp->instArrayOfPtrs = instArrayOfPtrs;
+}
+
+/// Copies instance description
+///
+/// @param batchPtrs batch pointers struct
+/// @param instanceDescArr inst desc source
+/// @param offset offset in dst buffer
+/// @param numInstances number of instances to copy
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+copy_instance_descriptors_array(
+ global InputBatchPtrs* batchPtrs,
+ global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr,
+ qword offset,
+ dword numInstances)
+{
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+ if (batchPtrs->dumpDst == 0) return;
+
+ global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC* )(
+ batchPtrs->nonVertexDataStart + offset);
+
+ if (glob_id < numInstances)
+ {
+ dst[glob_id] = instanceDescArr[glob_id];
+ }
+}
+
+/// Copies instance description, array of pointers version
+///
+/// @param batchPtrs batch pointers struct
+/// @param pInstanceDescPtrsArr inst desc source
+/// @param offset offset in dst buffer
+/// @param numInstances number of instances to copy
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+copy_instance_descriptors_array_of_ptrs(
+ global InputBatchPtrs* batchPtrs,
+ global qword* pInstanceDescPtrsArr,
+ qword offset,
+ dword numInstances)
+{
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+ if (batchPtrs->dumpDst == 0) return;
+
+ // save gpuva of instance descs for debug
+ global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset);
+
+ global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)(
+ batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset);
+ global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC **)pInstanceDescPtrsArr;
+
+ if (glob_id < numInstances)
+ {
+ gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id];
+ dst[glob_id] = *(instanceDescPtrsArr[glob_id]);
+ }
+}
+
+/// Adds copy operation to batch
+///
+/// @param batchPtrs batch pointers struct
+/// @param offset offset in dst buffer
+/// @param src copy source pointer
+/// @param dst copy destination pointer
+/// @param copyOpType copy type
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel insert_copy_op(
+ global InputBatchPtrs* batchPtrs,
+ qword offset,
+ global void* src,
+ global void* dst,
+ uint copyOpType)
+{
+ uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
+ if (batchPtrs->dumpDst == 0 || glob_id != 0) return;
+
+ global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset);
+
+ copyOp->header.operationType = copyOpType;
+ copyOp->header.endOfData = AlignTo128(sizeof(InputCopy));
+ copyOp->srcBvhPtr = (qword)src;
+ copyOp->dstBvhPtr = (qword)dst;
+}
+
+/// Copies vertex buffer
+///
+/// @param batchPtrs batch pointers struct
+/// @param src input buffer
+/// @param offset ptr to offset in dst buffer
+/// @param size ptr to number of bytes to copy
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_vertex_data(
+ global InputBatchPtrs* batchPtrs,
+ global const char* src,
+ global const uint* offset,
+ global const uint* size)
+{
+ if (batchPtrs->dumpDst == 0) return;
+
+ global char *dst = (global char *)(GetVertexBuffersStart(batchPtrs) + *offset);
+ uint numGroups = (*size >> 6) + 1;
+ CopyMemory(dst, src, *size, numGroups);
+}
+
+/// Generate unique batch id
+///
+/// @param batchIds array of unique batch ids
+/// @param index index of batch id to generate
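+///
+/// Each call atomically increments the upper dword of batchIds[index]
+/// (counterPtrs[index * 2 + 1] on this little-endian target) and ORs the batch
+/// index into the low bits, so ids stay unique per slot across submissions.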
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel generate_unique_batch_id(global unsigned long *batchIds, unsigned int index) {
+ global unsigned int *counterPtrs = (global unsigned int *)batchIds;
+ atomic_add(&counterPtrs[index * 2 + 1], 1);
+ batchIds[index] |= (unsigned long)index;
+}
+
+/// Sets batch as ready to read and moves cpuHead forward, inputs case
+///
+/// @param batchPtrs batch pointers struct
+/// @param dumpMainBuffer pointer to main dump buffer
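+///
+/// Publishes the batch in ring order: spin until cpuHead reaches this batch's
+/// offset (or, when the batch was placed at headStart after a wrap, until the
+/// entry at the current head is the END_BUFFER marker), flush the data to
+/// memory, then advance cpuHead past this batch.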
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel finish_batch_dump_inputs(
+ global InputBatchPtrs* batchPtrs,
+ global DebugBufferHeader* dumpMainBuffer)
+{
+ if (batchPtrs->dumpDst == 0)
+ return;
+
+ global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst;
+
+ dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
+
+ dword seven = 7;
+ while (true)
+ {
+ dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
+ if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
+ {
+ store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
+ currentHead = seven;
+ }
+
+ if (currentHead == myDstOffset)
+ {
+ mem_fence_evict_to_memory();
+ dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
+ break;
+ }
+ else if (myDstOffset == dumpMainBuffer->headStart)
+ {
+ global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead);
+ if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER)
+ {
+ mem_fence_evict_to_memory();
+ dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
+ break;
+ }
+ }
+ }
+}
+
+/// Sets batch as ready to read and moves cpuHead forward, outputs case
+///
+/// @param batchPtrs batch pointers struct
+/// @param dumpMainBuffer pointer to main dump buffer
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel finish_batch_dump_outputs(
+ global OutputBatchPtrs* batchPtrs,
+ global DebugBufferHeader* dumpMainBuffer)
+{
+ if (batchPtrs->dumpDst == 0)
+ return;
+
+ global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst;
+
+ dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);
+
+ dword seven = 7;
+ while (true)
+ {
+ dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
+ if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
+ {
+ store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
+ currentHead = seven;
+ }
+
+ if (currentHead == myDstOffset)
+ {
+ mem_fence_evict_to_memory();
+ dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
+ break;
+ }
+ else if (myDstOffset == dumpMainBuffer->headStart)
+ {
+ global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead);
+ if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER)
+ {
+ mem_fence_evict_to_memory();
+ dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
+ break;
+ }
+ }
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/input_dump.grl b/src/intel/vulkan/grl/gpu/input_dump.grl
new file mode 100644
index 00000000000..7cc6e60a95d
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/input_dump.grl
@@ -0,0 +1,252 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module input_dump;
+
+kernel_module input_dumper("input_dump.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_kernel_find_max_used_byte_in_buff < kernelFunction="find_max_used_byte_in_buff" >;
+ kernel opencl_kernel_allocate_linear_offsets_for_vertex_buffers < kernelFunction="allocate_linear_offsets_for_vertex_buffers" >;
+ kernel opencl_kernel_allocate_data_space_for_inputs < kernelFunction="allocate_data_space_for_inputs" >;
+ kernel opencl_kernel_allocate_data_space_for_outputs < kernelFunction="allocate_data_space_for_outputs" >;
+ kernel opencl_kernel_calc_outputs_data_size < kernelFunction="calc_outputs_data_size" >;
+ kernel opencl_kernel_write_output_data_op < kernelFunction="write_output_data_op" >;
+ kernel opencl_kernel_write_geo_data < kernelFunction="write_geo_data" >;
+ kernel opencl_kernel_write_input_build_op < kernelFunction="write_input_build_op" >;
+ kernel opencl_kernel_copy_instance_descriptors_array < kernelFunction="copy_instance_descriptors_array" >;
+ kernel opencl_kernel_copy_instance_descriptors_array_of_ptrs < kernelFunction="copy_instance_descriptors_array_of_ptrs" >;
+ kernel opencl_kernel_insert_copy_op < kernelFunction="insert_copy_op" >;
+ kernel opencl_kernel_copy_vertex_data < kernelFunction="copy_vertex_data" >;
+ kernel opencl_kernel_generate_unique_batch_id < kernelFunction="generate_unique_batch_id" >;
+ kernel opencl_kernel_finish_batch_dump_inputs < kernelFunction="finish_batch_dump_inputs" >;
+ kernel opencl_kernel_finish_batch_dump_outputs < kernelFunction="finish_batch_dump_outputs" >;
+}
+
+
+metakernel find_max_used_byte_in_buff(
+ qword indexBuffPtr,
+ qword vertexBufferUsedByteEnd,
+ dword IndexCount,
+ dword IndexFormat,
+ dword VertexCount,
+ qword VertexBufferByteStride,
+ dword numPhysThreads)
+{
+ dispatch opencl_kernel_find_max_used_byte_in_buff(numPhysThreads, 1, 1) args(
+ indexBuffPtr,
+ vertexBufferUsedByteEnd,
+ IndexCount,
+ IndexFormat,
+ VertexCount,
+ VertexBufferByteStride);
+}
+
+metakernel allocate_linear_offsets_for_vertex_buffers(
+ qword batchPtrs,
+ qword m_VertexBufferUsedByteEnd,
+ qword m_VertexBufferOffset,
+ dword numVertexBuffers,
+ dword numPhysThreads)
+{
+ dispatch opencl_kernel_allocate_linear_offsets_for_vertex_buffers(numPhysThreads, 1, 1) args(
+ batchPtrs,
+ m_VertexBufferUsedByteEnd,
+ m_VertexBufferOffset,
+ numVertexBuffers);
+}
+
+metakernel allocate_data_space_for_inputs(
+ qword inputDumpMainBuffer,
+ qword batchPtrs,
+ dword nonVertexSize,
+ qword batchIdPtr)
+{
+ dispatch opencl_kernel_allocate_data_space_for_inputs(1, 1, 1) args(
+ inputDumpMainBuffer,
+ batchPtrs,
+ nonVertexSize,
+ batchIdPtr);
+}
+
+metakernel allocate_data_space_for_outputs(
+ qword inputDumpMainBuffer,
+ qword batchPtrs,
+ qword batchIdPtr)
+{
+ dispatch opencl_kernel_allocate_data_space_for_outputs(1, 1, 1) args(
+ inputDumpMainBuffer,
+ batchPtrs,
+ batchIdPtr);
+}
+
+metakernel calc_outputs_data_size(
+ qword pbi,
+ qword destOffsets,
+ qword numOutputs,
+ qword batchPtrs)
+{
+ dispatch opencl_kernel_calc_outputs_data_size(1, 1, 1) args(
+ pbi,
+ destOffsets,
+ numOutputs,
+ batchPtrs);
+}
+
+metakernel write_output_data_op(
+ qword batchPtrs,
+ qword destOffset,
+ qword src,
+ qword pbi)
+{
+ dispatch opencl_kernel_write_output_data_op(1, 1, 1) args(
+ batchPtrs,
+ destOffset,
+ src,
+ pbi);
+}
+
+metakernel write_geo_data(
+ qword batchPtrs,
+ qword srcDesc,
+ qword pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
+ qword pVertexBufferSize,
+ qword dstDescOffset,
+ qword dstDataOffset,
+ dword numThreads)
+{
+ dispatch opencl_kernel_write_geo_data(numThreads, 1, 1) args(
+ batchPtrs,
+ srcDesc,
+ pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
+ pVertexBufferSize,
+ dstDescOffset,
+ dstDataOffset,
+ numThreads);
+}
+
+metakernel write_input_build_op(
+ qword batchPtrs,
+ qword buildOpOffset,
+ qword srcBvh,
+ qword dstBvhAddr,
+ dword offsetToEnd,
+ dword flags,
+ dword numGeometries,
+ dword numInstances,
+ dword instArrayOfPtrs)
+{
+ dispatch opencl_kernel_write_input_build_op(1, 1, 1) args(
+ batchPtrs,
+ buildOpOffset,
+ srcBvh,
+ dstBvhAddr,
+ offsetToEnd,
+ flags,
+ numGeometries,
+ numInstances,
+ instArrayOfPtrs);
+}
+
+metakernel copy_instance_descriptors_array(
+ qword batchPtrs,
+ qword instanceDescArr,
+ qword offset,
+ dword numInstances,
+ dword numPhysThreads)
+{
+ dispatch opencl_kernel_copy_instance_descriptors_array(numPhysThreads, 1, 1) args(
+ batchPtrs,
+ instanceDescArr,
+ offset,
+ numInstances);
+}
+
+metakernel copy_instance_descriptors_array_of_ptrs(
+ qword batchPtrs,
+ qword instanceDescArrPtrs,
+ qword offset,
+ dword numInstances,
+ dword numPhysThreads)
+{
+ dispatch opencl_kernel_copy_instance_descriptors_array_of_ptrs(numPhysThreads, 1, 1) args(
+ batchPtrs,
+ instanceDescArrPtrs,
+ offset,
+ numInstances);
+}
+
+metakernel insert_copy_op(
+ qword batchPtrs,
+ qword offset,
+ qword src,
+ qword dst,
+ dword type)
+{
+ dispatch opencl_kernel_insert_copy_op(1, 1, 1) args(
+ batchPtrs,
+ offset,
+ src,
+ dst,
+ type);
+}
+
+metakernel copy_vertex_data(
+ qword desc,
+ qword src,
+ qword offset,
+ qword size)
+{
+ define byteSize REG0;
+ define numGroupsRqd REG1;
+ define shift REG2;
+ define minimum REG3;
+
+ shift = 6;
+ minimum = 1;
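+ // Indirect dispatch sizing: numGroupsRqd = (byteSize >> 6) + 1, i.e. one
+ // work group per 64-byte chunk of the source, with the +1 covering any
+ // remainder (one extra group is dispatched when byteSize is already an
+ // exact multiple of 64).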
+ byteSize = load_dword(size);
+ numGroupsRqd = byteSize >> shift;
+ numGroupsRqd = numGroupsRqd + minimum;
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_vertex_data args(
+ desc,
+ src,
+ offset,
+ size);
+}
+
+metakernel generate_unique_batch_id(
+ qword batchIds,
+ dword batchIndex)
+{
+ dispatch opencl_kernel_generate_unique_batch_id(1, 1, 1) args(
+ batchIds,
+ batchIndex);
+}
+
+metakernel finish_batch_dump_inputs(
+ qword batchPtrs,
+ qword dumpMainBuffer)
+{
+ dispatch opencl_kernel_finish_batch_dump_inputs(1, 1, 1) args(
+ batchPtrs,
+ dumpMainBuffer);
+}
+
+metakernel finish_batch_dump_outputs(
+ qword batchPtrs,
+ qword dumpMainBuffer)
+{
+ dispatch opencl_kernel_finish_batch_dump_outputs(1, 1, 1) args(
+ batchPtrs,
+ dumpMainBuffer);
+}
diff --git a/src/intel/vulkan/grl/gpu/instance.h b/src/intel/vulkan/grl/gpu/instance.h
new file mode 100644
index 00000000000..e463a01dc90
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/instance.h
@@ -0,0 +1,183 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "shared.h"
+#include "affinespace.h"
+#include "api_interface.h"
+#include "qbvh6.h"
+#include "libs/lsc_intrinsics.h"
+
+GRL_INLINE uint32_t HwInstanceLeafPart1_getInstanceIndex(struct HwInstanceLeaf *I)
+{
+ return I->part1.instanceIndex;
+}
+
+GRL_INLINE void encodeDW0_HwInstanceLeafPart0(
+ uint32_t shaderIndex,
+ uint32_t geomMask,
+ uint4 *dst)
+{
+ (*dst).x = (shaderIndex & ((1 << 24) - 1)) |
+ (geomMask << 24);
+}
+
+GRL_INLINE void encodeDW1_HwInstanceLeafPart0(
+ uint32_t instanceContributionToHitGroupIndex,
+ uint32_t notProcedural,
+ uint32_t geomFlags,
+ uint4* dst)
+{
+ (*dst).y = (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) |
+ ((notProcedural & 1) << (24 + 5)) |
+ ((geomFlags & 3) << (24 + 5 + 1));
+}
+
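+// Packs the 48-bit root node pointer into DW2 and the low 16 bits of DW3,
+// with the instance flags stored in DW3 bits 16 and up.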
+GRL_INLINE void encodeDW2DW3_HwInstanceLeafPart0(
+ uint64_t rootNodePtr,
+ uint32_t instFlags,
+ uint4* dst)
+{
+ uint64_t flags = instFlags;
+ uint DW2 = (uint)rootNodePtr;
+ uint DW3 = ((uint)(rootNodePtr >> 32ul) & 0xffff);
+ DW3 |= flags << 16ull;
+ (*dst).z = DW2;
+ (*dst).w = DW3;
+}
+
+GRL_INLINE void HwInstanceLeafPart0_setDW0(struct HwInstanceLeaf *I,
+ uint32_t shaderIndex,
+ uint32_t geomMask)
+{
+ I->part0.DW0 =
+ (shaderIndex & ((1 << 24) - 1)) |
+ (geomMask << 24);
+}
+
+GRL_INLINE void HwInstanceLeafPart0_setDW1(struct HwInstanceLeaf *I,
+ uint32_t instanceContributionToHitGroupIndex,
+ uint32_t notProcedural,
+ uint32_t geomFlags)
+{
+ I->part0.DW1 =
+ (instanceContributionToHitGroupIndex & ((1 << 24) - 1)) |
+ ((notProcedural & 1) << (24 + 5)) |
+ ((geomFlags & 3) << (24 + 5 + 1));
+}
+
+GRL_INLINE void HwInstanceLeafPart1_setDW0DW1(struct HwInstanceLeaf *I,
+ global char *pBvhPtr)
+{
+ I->part1.DW0_DW1 = ((uint64_t)pBvhPtr) & (((uint64_t)1 << 48) - 1);
+}
+
+GRL_INLINE void HwInstanceLeafPart0_setDW2DW3(struct HwInstanceLeaf *I,
+ uint64_t rootNodePtr,
+ uint32_t instFlags)
+{
+ uint64_t flags = instFlags;
+ flags = flags << 48ull;
+ uint64_t ptr = rootNodePtr & 0x0000ffffffffffff;
+ I->part0.DW2_DW3 = ptr + flags;
+}
+
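+// Writes the two 64-byte halves of the HW instance leaf in uint4 chunks using
+// L1-streaming / L3-write-back stores: part1 carries the BLAS pointer,
+// instance ID/index and transform data, while part0 carries the fields HW
+// traversal consumes (shader index, geometry mask, flags, root node pointer)
+// together with the inverse (world-to-object) transform.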
+GRL_INLINE void HwInstanceLeaf_Constructor(global struct HwInstanceLeaf* leaf,
+ global const struct GRL_RAYTRACING_INSTANCE_DESC* instDesc,
+ uint instanceIndex,
+ uint rootNodeByteOffset,
+ uint instanceMask)
+{
+ global uint4* InstanceLeaf_4DWparts = (global uint4*) (leaf);
+
+ struct AffineSpace3f obj2world = AffineSpace3f_load_row_major(instDesc->Transform);
+
+ qword accStructPtr = (qword)instDesc->AccelerationStructure;
+ uint4 p1_DW0_3 = (uint4)(
+ (uint)accStructPtr,
+ (uint)(accStructPtr >> (uint64_t)32),
+ GRL_get_instanceID(instDesc),
+ instanceIndex);
+
+ struct AffineSpace3f world2obj = AffineSpace3f_invert(obj2world);
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 0 /*part1 + 0DW*/, p1_DW0_3);
+
+ uint4 p1_DW4_7 = (uint4)(
+ as_uint(obj2world.l.vx.x),
+ as_uint(obj2world.l.vx.y),
+ as_uint(obj2world.l.vx.z),
+ as_uint(obj2world.l.vy.x));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 1 /*part1 + 4DW*/, p1_DW4_7);
+
+ uint4 p1_DW8_11 = (uint4)(
+ as_uint(obj2world.l.vy.y),
+ as_uint(obj2world.l.vy.z),
+ as_uint(obj2world.l.vz.x),
+ as_uint(obj2world.l.vz.y));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 2 /*part1 + 8DW*/, p1_DW8_11);
+
+
+ uint4 p1_DW12_15 = (uint4)(
+ as_uint(obj2world.l.vz.z),
+ as_uint(world2obj.p.x),
+ as_uint(world2obj.p.y),
+ as_uint(world2obj.p.z));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 4 + 3 /*part1 + 12DW*/, p1_DW12_15);
+
+
+ uint hit_group_index = GRL_get_InstanceContributionToHitGroupIndex(instDesc);
+ global struct BVHBase* bvh = (global struct BVHBase*)instDesc->AccelerationStructure;
+
+ uint4 p0_DW0_3;
+
+ encodeDW0_HwInstanceLeafPart0(
+ hit_group_index,
+ instanceMask,
+ &p0_DW0_3);
+
+ encodeDW1_HwInstanceLeafPart0(
+ hit_group_index, // for HW instance leaf, this field is used to offset the hit-group index
+ 1, // disable opaque culling; necessary for SW instancing, don't-care for HW instancing
+ 0,
+ &p0_DW0_3);
+
+ encodeDW2DW3_HwInstanceLeafPart0(
+ rootNodeByteOffset == NO_NODE_OFFSET ? 0 : ((uint64_t)bvh) + rootNodeByteOffset, // NO_NODE_OFFSET marks a degenerate instance, so store a null root pointer
+ GRL_get_InstanceFlags(instDesc),
+ &p0_DW0_3);
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 0 /*part0 + 0DW*/, p0_DW0_3);
+
+ uint4 p0_DW4_7 = (uint4)(
+ as_uint(world2obj.l.vx.x),
+ as_uint(world2obj.l.vx.y),
+ as_uint(world2obj.l.vx.z),
+ as_uint(world2obj.l.vy.x));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 1 /*part0 + 4DW*/, p0_DW4_7);
+
+ uint4 p0_DW8_11 = (uint4)(
+ as_uint(world2obj.l.vy.y),
+ as_uint(world2obj.l.vy.z),
+ as_uint(world2obj.l.vz.x),
+ as_uint(world2obj.l.vz.y));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 2 /*part0 + 8DW*/, p0_DW8_11);
+
+ uint4 p0_DW12_15 = (uint4)(
+ as_uint(world2obj.l.vz.z),
+ as_uint(obj2world.p.x),
+ as_uint(obj2world.p.y),
+ as_uint(obj2world.p.z));
+
+ store_uint4_L1S_L3WB(InstanceLeaf_4DWparts, 3 /*part0 + 12DW*/, p0_DW12_15);
+}
diff --git a/src/intel/vulkan/grl/gpu/intrinsics.h b/src/intel/vulkan/grl/gpu/intrinsics.h
new file mode 100644
index 00000000000..0dff3147d8a
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/intrinsics.h
@@ -0,0 +1,581 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+// TODO: AABB_work_group_reduce is super slow, remove !!!
+
+#pragma cl_intel_subgroups : enable
+#pragma cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+
+uint intel_sub_group_ballot(bool valid);
+
+// atom_min
+float __attribute__((overloadable)) atom_min(volatile __global float *p, float val);
+float __attribute__((overloadable)) atom_min(volatile __local float *p, float val);
+float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val);
+float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val);
+// atom_max
+float __attribute__((overloadable)) atom_max(volatile __global float *p, float val);
+float __attribute__((overloadable)) atom_max(volatile __local float *p, float val);
+float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val);
+float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val);
+// atom_cmpxchg
+float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val);
+float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val);
+float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val);
+float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val);
+
+
+
+inline uint subgroup_single_atomic_add(global uint *p, uint val)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const int v = subgroupLocalID == 0 ? atomic_add(p, val) : 0;
+ return sub_group_broadcast(v, 0);
+}
+
+inline float halfarea(const float3 d)
+{
+ return fma(d.x, (d.y + d.z), d.y * d.z);
+}
+
+inline float area(const float3 d)
+{
+ return halfarea(d) * 2.0f;
+}
+
+inline uint maxDim(const float3 a)
+{
+ const float3 b = fabs(a);
+ const bool b_x_y = b.x > b.y;
+ const float cur_max = b_x_y ? b.x : b.y;
+ const uint cur_idx = b_x_y ? 0 : 1;
+ const bool b_x_y_z = b.z > cur_max;
+ return b_x_y_z ? 2 : cur_idx;
+}
+
+inline uint3 sortByMaxDim(const float3 a)
+{
+ const uint kz = maxDim(a);
+ const uint _kx = (kz + 1) % 3;
+ const uint _ky = (_kx + 1) % 3;
+ const bool kz_pos = a[kz] >= 0.0f;
+ const uint kx = kz_pos ? _ky : _kx;
+ const uint ky = kz_pos ? _kx : _ky;
+ return (uint3)(kx, ky, kz);
+}
+
+inline uint4 sort4_ascending(const uint4 dist)
+{
+ const uint a0 = dist.s0;
+ const uint a1 = dist.s1;
+ const uint a2 = dist.s2;
+ const uint a3 = dist.s3;
+ const uint b0 = min(a0, a2);
+ const uint b1 = min(a1, a3);
+ const uint b2 = max(a0, a2);
+ const uint b3 = max(a1, a3);
+ const uint c0 = min(b0, b1);
+ const uint c1 = max(b0, b1);
+ const uint c2 = min(b2, b3);
+ const uint c3 = max(b2, b3);
+ const uint d0 = c0;
+ const uint d1 = min(c1, c2);
+ const uint d2 = max(c1, c2);
+ const uint d3 = c3;
+ return (uint4)(d0, d1, d2, d3);
+}
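+// 5-comparator sorting network; e.g. sort4_ascending((uint4)(7, 1, 5, 3))
+// returns (1, 3, 5, 7).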
+
+__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4};
+__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0};
+__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5};
+__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6};
+__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6};
+
+__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1};
+__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1};
+__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1};
+
+__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1};
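+// The shuffle*/sel* tables above drive a bitonic-style SIMD8 sorting network:
+// each compare_exchange pass pairs lane i with lane shuffleX[i] and keeps the
+// minimum in lane i when selXX[i] is non-zero, the maximum otherwise.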
+
+inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask)
+{
+ const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
+ const uint a_min = min(a0, a1);
+ const uint a_max = max(a0, a1);
+ return select(a_max, a_min, selectMask);
+}
+
+inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask)
+{
+ const uint a1 = intel_sub_group_shuffle(a0, shuffleMask);
+ const uint a_min = min(a0, a1);
+ const uint a_max = max(a0, a1);
+ return select(a_min, a_max, selectMask);
+}
+
+inline uint sort8_descending(const uint aa)
+{
+ const unsigned int slotID = get_sub_group_local_id() % 8;
+ const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
+ const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
+ const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]);
+ const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]);
+ const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]);
+ const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]);
+ return gg;
+}
+
+inline uint sort8_ascending(const uint aa)
+{
+ const unsigned int slotID = get_sub_group_local_id() % 8;
+ const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]);
+ const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]);
+ const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], selAA[slotID]);
+ const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]);
+ const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]);
+ const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]);
+ return gg;
+}
+
+inline uint sort4_descending(const uint aa)
+{
+ const unsigned int slotID = get_sub_group_local_id() % 8;
+ const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]);
+ const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]);
+ const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]);
+ return dd;
+}
+
+inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
+{
+ const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
+ const ulong a_min = min(a0, a1);
+ const ulong a_max = max(a0, a1);
+ return select(a_max, a_min, (ulong)selectMask);
+}
+
+inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
+{
+ const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask);
+ const ulong a_min = min(a0, a1);
+ const ulong a_max = max(a0, a1);
+ return select(a_min, a_max, (ulong)selectMask);
+}
+
+inline ulong sort8_ascending_ulong(const ulong aa)
+{
+ const unsigned int slotID = get_sub_group_local_id() % 8;
+ const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]);
+ const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]);
+ const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]);
+ const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]);
+ const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]);
+ const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]);
+ return gg;
+}
+
+inline uint bitInterleave3D(const uint4 in)
+{
+ uint x = in.x, y = in.y, z = in.z;
+ x = (x | (x << 16)) & 0x030000FF;
+ x = (x | (x << 8)) & 0x0300F00F;
+ x = (x | (x << 4)) & 0x030C30C3;
+ x = (x | (x << 2)) & 0x09249249;
+
+ y = (y | (y << 16)) & 0x030000FF;
+ y = (y | (y << 8)) & 0x0300F00F;
+ y = (y | (y << 4)) & 0x030C30C3;
+ y = (y | (y << 2)) & 0x09249249;
+
+ z = (z | (z << 16)) & 0x030000FF;
+ z = (z | (z << 8)) & 0x0300F00F;
+ z = (z | (z << 4)) & 0x030C30C3;
+ z = (z | (z << 2)) & 0x09249249;
+
+ return x | (y << 1) | (z << 2);
+}
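+// Morton-code interleave: bit i of x lands at output bit 3*i, with y and z
+// offset by 1 and 2, e.g. (x, y, z) = (1, 1, 1) yields 0b111 == 7.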
+
+inline uint bitInterleave4D(const uint4 in)
+{
+ uint x = in.x, y = in.y, z = in.z, w = in.w;
+
+ x = x & 0x000000ff;
+ x = (x ^ (x << 16)) & 0x00c0003f;
+ x = (x ^ (x << 8)) & 0x00c03807;
+ x = (x ^ (x << 4)) & 0x08530853;
+ x = (x ^ (x << 2)) & 0x09090909;
+ x = (x ^ (x << 1)) & 0x11111111;
+
+ y = y & 0x000000ff;
+ y = (y ^ (y << 16)) & 0x00c0003f;
+ y = (y ^ (y << 8)) & 0x00c03807;
+ y = (y ^ (y << 4)) & 0x08530853;
+ y = (y ^ (y << 2)) & 0x09090909;
+ y = (y ^ (y << 1)) & 0x11111111;
+
+ z = z & 0x000000ff;
+ z = (z ^ (z << 16)) & 0x00c0003f;
+ z = (z ^ (z << 8)) & 0x00c03807;
+ z = (z ^ (z << 4)) & 0x08530853;
+ z = (z ^ (z << 2)) & 0x09090909;
+ z = (z ^ (z << 1)) & 0x11111111;
+
+ w = w & 0x000000ff;
+ w = (w ^ (w << 16)) & 0x00c0003f;
+ w = (w ^ (w << 8)) & 0x00c03807;
+ w = (w ^ (w << 4)) & 0x08530853;
+ w = (w ^ (w << 2)) & 0x09090909;
+ w = (w ^ (w << 1)) & 0x11111111;
+
+ return (x | (y << 1) | (z << 2) | (w << 3));
+}
+
+inline ulong ulong_bitInterleave4D(const uint4 in)
+{
+ ulong x = in.x, y = in.y, z = in.z, w = in.w;
+
+ x = x & 0x0000ffff;
+ x = (x ^ (x << 32)) & 0x0000f800000007ff;
+ x = (x ^ (x << 16)) & 0x0000f80007c0003f;
+ x = (x ^ (x << 8)) & 0x00c0380700c03807;
+ x = (x ^ (x << 4)) & 0x0843084308430843;
+ x = (x ^ (x << 2)) & 0x0909090909090909;
+ x = (x ^ (x << 1)) & 0x1111111111111111;
+
+ y = y & 0x0000ffff;
+ y = (y ^ (y << 32)) & 0x0000f800000007ff;
+ y = (y ^ (y << 16)) & 0x0000f80007c0003f;
+ y = (y ^ (y << 8)) & 0x00c0380700c03807;
+ y = (y ^ (y << 4)) & 0x0843084308430843;
+ y = (y ^ (y << 2)) & 0x0909090909090909;
+ y = (y ^ (y << 1)) & 0x1111111111111111;
+
+ z = z & 0x0000ffff;
+ z = (z ^ (z << 32)) & 0x0000f800000007ff;
+ z = (z ^ (z << 16)) & 0x0000f80007c0003f;
+ z = (z ^ (z << 8)) & 0x00c0380700c03807;
+ z = (z ^ (z << 4)) & 0x0843084308430843;
+ z = (z ^ (z << 2)) & 0x0909090909090909;
+ z = (z ^ (z << 1)) & 0x1111111111111111;
+
+ w = w & 0x0000ffff;
+ w = (w ^ (w << 32)) & 0x0000f800000007ff;
+ w = (w ^ (w << 16)) & 0x0000f80007c0003f;
+ w = (w ^ (w << 8)) & 0x00c0380700c03807;
+ w = (w ^ (w << 4)) & 0x0843084308430843;
+ w = (w ^ (w << 2)) & 0x0909090909090909;
+ w = (w ^ (w << 1)) & 0x1111111111111111;
+
+ return (x | (y << 1) | (z << 2) | (w << 3));
+}
+
+inline uint bitCompact(uint x)
+{
+ x &= 0x09249249;
+ x = (x ^ (x >> 2)) & 0x030c30c3;
+ x = (x ^ (x >> 4)) & 0x0300f00f;
+ x = (x ^ (x >> 8)) & 0xff0000ff;
+ x = (x ^ (x >> 16)) & 0x000003ff;
+ return x;
+}
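+// Inverse of the 3D interleave above: gathers every third bit,
+// e.g. bitCompact(0b1001001) == 0b111.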
+
+inline uint3 bitCompact3D(const uint in)
+{
+ const uint x = bitCompact(in >> 0);
+ const uint y = bitCompact(in >> 1);
+ const uint z = bitCompact(in >> 2);
+ return (uint3)(x, y, z);
+}
+
+inline uint convertToPushIndices8(uint ID)
+{
+ const unsigned int slotID = get_sub_group_local_id();
+ uint index = 0;
+ for (uint i = 0; i < 8; i++)
+ {
+ const uint mask = intel_sub_group_ballot(ID == i);
+ const uint new_index = ctz(mask);
+ index = i == slotID ? new_index : index;
+ }
+ return index;
+}
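+// Each lane s ends up with the index of the lowest lane whose ID equals s
+// (ctz(0) if no lane carries that ID); for a permutation of IDs this inverts
+// the lane -> ID mapping.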
+
+inline uint convertToPushIndices16(uint ID)
+{
+ const unsigned int slotID = get_sub_group_local_id();
+ uint index = 0;
+ for (uint i = 0; i < 16; i++)
+ {
+ const uint mask = intel_sub_group_ballot(ID == i);
+ const uint new_index = ctz(mask);
+ index = i == slotID ? new_index : index;
+ }
+ return index;
+}
+
+#define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK
+#define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK
+#define FLOAT_NEG_ONE_EXP_MASK (0x3F000000)
+#define FLOAT_BIAS (127)
+#define FLOAT_MANTISSA_BITS (23)
+
+inline float3 frexp_vec3(float3 len, int3* exp)
+{
+ float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK));
+ mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f)));
+ mant = copysign(mant, len);
+ *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1));
+ return mant;
+}
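+// Vectorized frexp following the standard convention of a mantissa in
+// [0.5, 1), e.g. len = 12.0f gives mant = 0.75f and *exp = 4 (12 == 0.75 * 2^4).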
+
+
+#ifndef uniform
+#define uniform
+#endif
+
+#ifndef varying
+#define varying
+#endif
+
+uint get_sub_group_global_id()
+{
+ return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 );
+}
+
+// each lane contains the number of 1 bits below the corresponding position in 'mask'
+uint subgroup_bit_prefix_exclusive(uniform uint mask)
+{
+ varying ushort lane = get_sub_group_local_id();
+ varying uint lane_mask = (1 << lane) - 1;
+ varying uint m = mask & lane_mask;
+ return popcount(m);
+}
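+// Illustrative: for mask = 0b1011, lanes 0..3 compute 0, 1, 2, 2 (the
+// exclusive prefix popcount of the mask).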
+
+uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx )
+{
+ varying uint lane_mask = (1 << lane_idx) - 1;
+ varying uint m = mask & lane_mask;
+ return popcount(m);
+}
+
+
+uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx)
+{
+ return (uint3)(sub_group_broadcast(v.x,idx),
+ sub_group_broadcast(v.y,idx),
+ sub_group_broadcast(v.z,idx));
+}
+
+float3 sub_group_broadcast_float3(float3 v, uniform ushort idx)
+{
+ return (float3)(sub_group_broadcast(v.x, idx),
+ sub_group_broadcast(v.y, idx),
+ sub_group_broadcast(v.z, idx));
+}
+
+float3 sub_group_reduce_min_float3(float3 v)
+{
+ return (float3)(sub_group_reduce_min(v.x),
+ sub_group_reduce_min(v.y),
+ sub_group_reduce_min(v.z) );
+}
+float3 sub_group_reduce_max_float3(float3 v)
+{
+ return (float3)(sub_group_reduce_max(v.x),
+ sub_group_reduce_max(v.y),
+ sub_group_reduce_max(v.z));
+}
+
+float3 sub_group_shuffle_float3(float3 v, uniform ushort idx)
+{
+ return (float3)(intel_sub_group_shuffle(v.x, idx),
+ intel_sub_group_shuffle(v.y, idx),
+ intel_sub_group_shuffle(v.z, idx));
+}
+uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx)
+{
+ return (uint3)( intel_sub_group_shuffle(v.x, idx),
+ intel_sub_group_shuffle(v.y, idx),
+ intel_sub_group_shuffle(v.z, idx));
+}
+
+
+inline uchar sub_group_reduce_or_N6(uchar val)
+{
+ val = val | intel_sub_group_shuffle_down(val, val, 4);
+ val = val | intel_sub_group_shuffle_down(val, val, 2);
+ val = val | intel_sub_group_shuffle_down(val, val, 1);
+ return sub_group_broadcast(val, 0);
+}
+
+inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val)
+{
+ uint SIMD8_id = get_sub_group_local_id() / 8;
+ val = val | intel_sub_group_shuffle_down(val, val, 4);
+ val = val | intel_sub_group_shuffle_down(val, val, 2);
+ val = val | intel_sub_group_shuffle_down(val, val, 1);
+
+ return intel_sub_group_shuffle(val, SIMD8_id * 8);
+}
+
+
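+// Convenience wrappers: relaxed-ordering, work-group-scope atomics on local
+// (SLM) memory built on the OpenCL 2.0 atomic_fetch_*_explicit builtins.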
+inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p )
+{
+ return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group );
+}
+
+inline __attribute__((overloadable)) int atomic_inc_local(local int* p)
+{
+ return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) uint atomic_dec_local(local uint* p)
+{
+ return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) int atomic_dec_local(local int* p)
+{
+ return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n)
+{
+ return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n )
+{
+ return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_add_local( local uint* p, uint n )
+{
+ return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_xor_local(local uint* p, uint n)
+{
+ return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_or_local(local uint* p, uint n)
+{
+ return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_min_local(local uint* p, uint n)
+{
+ return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+inline uint atomic_max_local(local uint* p, uint n)
+{
+ return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group);
+}
+
+
+
+
+inline uint atomic_inc_global( global uint* p )
+{
+ return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_dec_global(global uint* p)
+{
+ return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device);
+}
+
+inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired)
+{
+ return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_add_global( global uint* p, uint n )
+{
+ return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_sub_global(global uint* p, uint n)
+{
+ return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+inline uint atomic_or_global(global uint* p, uint n)
+{
+ return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
+}
+
+
+inline uint atomic_inc_global_acquire(global uint* p)
+{
+ return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device);
+}
+
+
+inline uint atomic_inc_global_release(global uint* p)
+{
+ return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
+}
+inline uint atomic_dec_global_release(global uint* p)
+{
+ return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
+}
+
+inline uint generic_atomic_add(uint* p, uint val)
+{
+ if (to_global(p) != NULL)
+ return atomic_add_global(to_global(p), val);
+ if (to_local(p) != NULL)
+ return atomic_add_local(to_local(p), val);
+ return 0;
+}
+
+inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n )
+{
+ n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
+ n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
+ n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
+ return sub_group_broadcast( n, 0 );
+}
+
+inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n )
+{
+ n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
+ n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
+ n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
+ return sub_group_broadcast( n, 0 );
+}
+
+inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n)
+{
+ n = max(n, intel_sub_group_shuffle_down(n, n, 4));
+ n = max(n, intel_sub_group_shuffle_down(n, n, 2));
+ n = max(n, intel_sub_group_shuffle_down(n, n, 1));
+ return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8);//sub_group_broadcast(n, 0);
+}
+
+inline uint generic_atomic_inc(uint* p)
+{
+ if (to_global(p) != NULL)
+ return atomic_inc_global(to_global(p));
+ if (to_local(p) != NULL)
+ return atomic_inc(to_local(p));
+ return 0;
+}
+
+
+// Built-in GRL function which, if called in a kernel body, will force the kernel
+// to be compiled to the minimum SIMD width supported by the platform
+void GRL_UseMinimumSIMDWidth();
\ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/libs/libraries.grl b/src/intel/vulkan/grl/gpu/libs/libraries.grl
new file mode 100644
index 00000000000..1d6c0d2c6c5
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/libs/libraries.grl
@@ -0,0 +1,13 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+library lsc_intrinsics
+{
+ default "lsc_intrinsics.cl" ;
+ fallback "lsc_intrinsics_fallback.cl";
+}
+
diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl
new file mode 100644
index 00000000000..03a76ba36f1
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.cl
@@ -0,0 +1,1033 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// LSC Cache options
+// Load message caching control
+enum LSC_LDCC {
+ LSC_LDCC_DEFAULT,
+ LSC_LDCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached
+ LSC_LDCC_L1UC_L3C, // Override to L1 uncached and L3 cached
+ LSC_LDCC_L1C_L3UC, // Override to L1 cached and L3 uncached
+ LSC_LDCC_L1C_L3C, // Override to L1 cached and L3 cached
+ LSC_LDCC_L1S_L3UC, // Override to L1 streaming load and L3 uncached
+ LSC_LDCC_L1S_L3C, // Override to L1 streaming load and L3 cached
+ LSC_LDCC_L1IAR_L3C, // Override to L1 invalidate-after-read, and L3 cached
+};
+
+// Store message caching control (also used for atomics)
+enum LSC_STCC {
+ LSC_STCC_DEFAULT,
+ LSC_STCC_L1UC_L3UC, // Override to L1 uncached and L3 uncached
+ LSC_STCC_L1UC_L3WB, // Override to L1 uncached and L3 written back
+ LSC_STCC_L1WT_L3UC, // Override to L1 written through and L3 uncached
+ LSC_STCC_L1WT_L3WB, // Override to L1 written through and L3 written back
+ LSC_STCC_L1S_L3UC, // Override to L1 streaming and L3 uncached
+ LSC_STCC_L1S_L3WB, // Override to L1 streaming and L3 written back
+ LSC_STCC_L1WB_L3WB, // Override to L1 written back and L3 written back
+};
+
+// LSC Loads
+
+// Global address space
+uint __builtin_IB_lsc_load_global_uchar_to_uint (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
+uint __builtin_IB_lsc_load_global_ushort_to_uint(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
+uint __builtin_IB_lsc_load_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
+uint2 __builtin_IB_lsc_load_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
+uint3 __builtin_IB_lsc_load_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
+uint4 __builtin_IB_lsc_load_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
+uint8 __builtin_IB_lsc_load_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
+ulong __builtin_IB_lsc_load_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
+ulong2 __builtin_IB_lsc_load_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
+ulong3 __builtin_IB_lsc_load_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
+ulong4 __builtin_IB_lsc_load_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
+ulong8 __builtin_IB_lsc_load_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8
+
+// Local address space
+uint __builtin_IB_lsc_load_local_uchar_to_uint( const __local uchar *base, int immElemOff); //D8U32
+uint __builtin_IB_lsc_load_local_ushort_to_uint(const __local ushort *base, int immElemOff); //D16U32
+uint __builtin_IB_lsc_load_local_uint (const __local uint *base, int immElemOff); //D32V1
+uint2 __builtin_IB_lsc_load_local_uint2 (const __local uint2 *base, int immElemOff); //D32V2
+uint3 __builtin_IB_lsc_load_local_uint3 (const __local uint3 *base, int immElemOff); //D32V3
+uint4 __builtin_IB_lsc_load_local_uint4 (const __local uint4 *base, int immElemOff); //D32V4
+uint8 __builtin_IB_lsc_load_local_uint8 (const __local uint8 *base, int immElemOff); //D32V8
+ulong __builtin_IB_lsc_load_local_ulong (const __local ulong *base, int immElemOff); //D64V1
+ulong2 __builtin_IB_lsc_load_local_ulong2(const __local ulong2 *base, int immElemOff); //D64V2
+ulong3 __builtin_IB_lsc_load_local_ulong3(const __local ulong3 *base, int immElemOff); //D64V3
+ulong4 __builtin_IB_lsc_load_local_ulong4(const __local ulong4 *base, int immElemOff); //D64V4
+ulong8 __builtin_IB_lsc_load_local_ulong8(const __local ulong8 *base, int immElemOff); //D64V8
+
+// LSC Stores
+
+// Global address space
+void __builtin_IB_lsc_store_global_uchar_from_uint (__global uchar *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D8U32
+void __builtin_IB_lsc_store_global_ushort_from_uint(__global ushort *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D16U32
+void __builtin_IB_lsc_store_global_uint (__global uint *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D32V1
+void __builtin_IB_lsc_store_global_uint2 (__global uint2 *base, int immElemOff, uint2 val, enum LSC_STCC cacheOpt); //D32V2
+void __builtin_IB_lsc_store_global_uint3 (__global uint3 *base, int immElemOff, uint3 val, enum LSC_STCC cacheOpt); //D32V3
+void __builtin_IB_lsc_store_global_uint4 (__global uint4 *base, int immElemOff, uint4 val, enum LSC_STCC cacheOpt); //D32V4
+void __builtin_IB_lsc_store_global_uint8 (__global uint8 *base, int immElemOff, uint8 val, enum LSC_STCC cacheOpt); //D32V8
+void __builtin_IB_lsc_store_global_ulong (__global ulong *base, int immElemOff, ulong val, enum LSC_STCC cacheOpt); //D64V1
+void __builtin_IB_lsc_store_global_ulong2(__global ulong2 *base, int immElemOff, ulong2 val, enum LSC_STCC cacheOpt); //D64V2
+void __builtin_IB_lsc_store_global_ulong3(__global ulong3 *base, int immElemOff, ulong3 val, enum LSC_STCC cacheOpt); //D64V3
+void __builtin_IB_lsc_store_global_ulong4(__global ulong4 *base, int immElemOff, ulong4 val, enum LSC_STCC cacheOpt); //D64V4
+void __builtin_IB_lsc_store_global_ulong8(__global ulong8 *base, int immElemOff, ulong8 val, enum LSC_STCC cacheOpt); //D64V8
+
+// Local address space
+void __builtin_IB_lsc_store_local_uchar_from_uint (__local uchar *base, int immElemOff, uint val); //D8U32
+void __builtin_IB_lsc_store_local_ushort_from_uint(__local ushort *base, int immElemOff, uint val); //D16U32
+void __builtin_IB_lsc_store_local_uint (__local uint *base, int immElemOff, uint val); //D32V1
+void __builtin_IB_lsc_store_local_uint2 (__local uint2 *base, int immElemOff, uint2 val); //D32V2
+void __builtin_IB_lsc_store_local_uint3 (__local uint3 *base, int immElemOff, uint3 val); //D32V3
+void __builtin_IB_lsc_store_local_uint4 (__local uint4 *base, int immElemOff, uint4 val); //D32V4
+void __builtin_IB_lsc_store_local_uint8 (__local uint8 *base, int immElemOff, uint8 val); //D32V8
+void __builtin_IB_lsc_store_local_ulong (__local ulong *base, int immElemOff, ulong val); //D64V1
+void __builtin_IB_lsc_store_local_ulong2(__local ulong2 *base, int immElemOff, ulong2 val); //D64V2
+void __builtin_IB_lsc_store_local_ulong3(__local ulong3 *base, int immElemOff, ulong3 val); //D64V3
+void __builtin_IB_lsc_store_local_ulong4(__local ulong4 *base, int immElemOff, ulong4 val); //D64V4
+void __builtin_IB_lsc_store_local_ulong8(__local ulong8 *base, int immElemOff, ulong8 val); //D64V8
+
+// LSC prefetching
+
+// LSC Pre-Fetch Load functions with CacheControls
+// Global address space
+void __builtin_IB_lsc_prefetch_global_uchar (const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
+void __builtin_IB_lsc_prefetch_global_ushort(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
+void __builtin_IB_lsc_prefetch_global_uint (const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
+void __builtin_IB_lsc_prefetch_global_uint2 (const __global uint2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
+void __builtin_IB_lsc_prefetch_global_uint3 (const __global uint3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
+void __builtin_IB_lsc_prefetch_global_uint4 (const __global uint4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
+void __builtin_IB_lsc_prefetch_global_uint8 (const __global uint8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
+void __builtin_IB_lsc_prefetch_global_ulong (const __global ulong *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
+void __builtin_IB_lsc_prefetch_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
+void __builtin_IB_lsc_prefetch_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
+void __builtin_IB_lsc_prefetch_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
+void __builtin_IB_lsc_prefetch_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8
+
+// LSC Fence support
+
+// FS - Fence Scope
+enum LSC_FS {
+ LSC_FS_THREAD_GROUP,
+ LSC_FS_LOCAL,
+ LSC_FS_TILE,
+ LSC_FS_GPU,
+ LSC_FS_GPUs,
+ LSC_FS_SYSTEM_RELEASE,
+ LSC_FS_SYSTEM_ACQUIRE
+};
+
+// FT - Fence Type
+enum LSC_FT {
+ LSC_FT_DEFAULT,
+ LSC_FT_EVICT,
+ LSC_FT_INVALIDATE,
+ LSC_FT_DISCARD,
+ LSC_FT_CLEAN,
+ LSC_FT_L3
+};
+
+// LSC Fence functions
+void __builtin_IB_lsc_fence_global_untyped(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGM
+void __builtin_IB_lsc_fence_global_untyped_cross_tile(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - UGML
+void __builtin_IB_lsc_fence_global_typed(enum LSC_FS scope, enum LSC_FT flushType); // Mem Port - TGM
+void __builtin_IB_lsc_fence_local(); // Mem Port - SLM
+
+// Exported functions
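+// Naming convention: <op>_<type>_<L1 policy>_<L3 policy>; the suffix selects
+// the LSC_LDCC/LSC_STCC cache-control value passed to the corresponding
+// builtin, e.g. load_uint4_L1C_L3UC loads a uint4 with L1 cached, L3 uncached.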
+
+// LSC Loads
+// uchar
+uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ushort
+uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint
+uint load_uint_L1UC_L3UC(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint load_uint_L1UC_L3C(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint load_uint_L1C_L3UC(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint load_uint_L1C_L3C(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint load_uint_L1S_L3UC(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint load_uint_L1S_L3C(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint load_uint_L1IAR_L3C(global uint* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint2
+uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint2 load_uint2_L1UC_L3C(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint2 load_uint2_L1C_L3UC(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint2 load_uint2_L1C_L3C(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint2 load_uint2_L1S_L3UC(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint2 load_uint2_L1S_L3C(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint3
+uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint3 load_uint3_L1UC_L3C(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint3 load_uint3_L1C_L3UC(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint3 load_uint3_L1C_L3C(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint3 load_uint3_L1S_L3UC(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint3 load_uint3_L1S_L3C(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint4
+uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint4 load_uint4_L1UC_L3C(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint4 load_uint4_L1C_L3UC(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint4 load_uint4_L1C_L3C(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint4 load_uint4_L1S_L3UC(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint4 load_uint4_L1S_L3C(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// uint8
+uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+uint8 load_uint8_L1UC_L3C(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+uint8 load_uint8_L1C_L3UC(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+uint8 load_uint8_L1C_L3C(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+uint8 load_uint8_L1S_L3UC(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+uint8 load_uint8_L1S_L3C(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong
+ulong load_ulong_L1UC_L3UC(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong load_ulong_L1UC_L3C(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong load_ulong_L1C_L3UC(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong load_ulong_L1C_L3C(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong load_ulong_L1S_L3UC(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong load_ulong_L1S_L3C(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong load_ulong_L1IAR_L3C(global ulong* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong2
+ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong3
+ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong4
+ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// ulong8
+ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3UC);
+}
+
+ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3C);
+}
+
+ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3UC);
+}
+
+ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3C);
+}
+
+ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3UC);
+}
+
+ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3C);
+}
+
+ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset)
+{
+ return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1IAR_L3C);
+}
+
+// LSC Stores
+// uchar
+void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ushort
+void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint
+void store_uint_L1UC_L3UC(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint_L1UC_L3WB(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint_L1WT_L3UC(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint_L1WT_L3WB(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint_L1S_L3UC(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint_L1S_L3WB(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint_L1WB_L3WB(global uint* it, int offset, uint value)
+{
+ __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint2
+void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value)
+{
+ __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint3
+void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value)
+{
+ __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint4
+void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value)
+{
+ __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// uint8
+void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value)
+{
+ __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong
+void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value)
+{
+ __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong2
+void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong3
+void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong4
+void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// ulong8
+void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3UC);
+}
+
+void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3WB);
+}
+
+void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3UC);
+}
+
+void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3WB);
+}
+
+void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3UC);
+}
+
+void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3WB);
+}
+
+void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WB_L3WB);
+}
+
+// LSC Fence support
+void mem_fence_gpu_default()
+{
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_DEFAULT);
+}
+
+void mem_fence_workgroup_default()
+{
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_THREAD_GROUP, LSC_FT_DEFAULT);
+}
+
+void mem_fence_gpu_invalidate()
+{
+ // NOTE: 'FS_TILE' is used here to avoid a DG2 HW bug where the L3 is needlessly flushed on a 'GPU'-scope fence
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_INVALIDATE);
+}
+
+void mem_fence_gpu_evict()
+{
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_EVICT);
+}
+
+void mem_fence_evict_to_memory()
+{
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_EVICT);
+ __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_L3);
+}
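+
+// Illustrative usage (assumption, not part of this patch): a writer that needs its
+// results to be visible past the GPU caches would typically pair a cached store with
+// an evicting fence, e.g.:
+//   store_uint_L1WB_L3WB(out, 0, value);
+//   mem_fence_evict_to_memory();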
diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h
new file mode 100644
index 00000000000..a12dac00e77
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics.h
@@ -0,0 +1,207 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// LSC Loads
+uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset);
+uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset);
+uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset);
+uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset);
+uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset);
+uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset);
+uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset);
+
+uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset);
+uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset);
+uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset);
+uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset);
+uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset);
+uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset);
+uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset);
+
+uint load_uint_L1UC_L3UC(global uint* it, int offset);
+uint load_uint_L1UC_L3C(global uint* it, int offset);
+uint load_uint_L1C_L3UC(global uint* it, int offset);
+uint load_uint_L1C_L3C(global uint* it, int offset);
+uint load_uint_L1S_L3UC(global uint* it, int offset);
+uint load_uint_L1S_L3C(global uint* it, int offset);
+uint load_uint_L1IAR_L3C(global uint* it, int offset);
+
+uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset);
+uint2 load_uint2_L1UC_L3C(global uint2* it, int offset);
+uint2 load_uint2_L1C_L3UC(global uint2* it, int offset);
+uint2 load_uint2_L1C_L3C(global uint2* it, int offset);
+uint2 load_uint2_L1S_L3UC(global uint2* it, int offset);
+uint2 load_uint2_L1S_L3C(global uint2* it, int offset);
+uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset);
+
+uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset);
+uint3 load_uint3_L1UC_L3C(global uint3* it, int offset);
+uint3 load_uint3_L1C_L3UC(global uint3* it, int offset);
+uint3 load_uint3_L1C_L3C(global uint3* it, int offset);
+uint3 load_uint3_L1S_L3UC(global uint3* it, int offset);
+uint3 load_uint3_L1S_L3C(global uint3* it, int offset);
+uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset);
+
+uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset);
+uint4 load_uint4_L1UC_L3C(global uint4* it, int offset);
+uint4 load_uint4_L1C_L3UC(global uint4* it, int offset);
+uint4 load_uint4_L1C_L3C(global uint4* it, int offset);
+uint4 load_uint4_L1S_L3UC(global uint4* it, int offset);
+uint4 load_uint4_L1S_L3C(global uint4* it, int offset);
+uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset);
+
+uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset);
+uint8 load_uint8_L1UC_L3C(global uint8* it, int offset);
+uint8 load_uint8_L1C_L3UC(global uint8* it, int offset);
+uint8 load_uint8_L1C_L3C(global uint8* it, int offset);
+uint8 load_uint8_L1S_L3UC(global uint8* it, int offset);
+uint8 load_uint8_L1S_L3C(global uint8* it, int offset);
+uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset);
+
+ulong load_ulong_L1UC_L3UC(global ulong* it, int offset);
+ulong load_ulong_L1UC_L3C(global ulong* it, int offset);
+ulong load_ulong_L1C_L3UC(global ulong* it, int offset);
+ulong load_ulong_L1C_L3C(global ulong* it, int offset);
+ulong load_ulong_L1S_L3UC(global ulong* it, int offset);
+ulong load_ulong_L1S_L3C(global ulong* it, int offset);
+ulong load_ulong_L1IAR_L3C(global ulong* it, int offset);
+
+ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset);
+ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset);
+ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset);
+ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset);
+ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset);
+ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset);
+ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset);
+
+ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset);
+ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset);
+ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset);
+ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset);
+ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset);
+ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset);
+ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset);
+
+ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset);
+ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset);
+ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset);
+ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset);
+ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset);
+ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset);
+ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset);
+
+ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset);
+ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset);
+ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset);
+ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset);
+ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset);
+ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset);
+ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset);
+
+// LSC Stores
+void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value);
+void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value);
+
+void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value);
+void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value);
+
+void store_uint_L1UC_L3UC(global uint* it, int offset, uint value);
+void store_uint_L1UC_L3WB(global uint* it, int offset, uint value);
+void store_uint_L1WT_L3UC(global uint* it, int offset, uint value);
+void store_uint_L1WT_L3WB(global uint* it, int offset, uint value);
+void store_uint_L1S_L3UC(global uint* it, int offset, uint value);
+void store_uint_L1S_L3WB(global uint* it, int offset, uint value);
+void store_uint_L1WB_L3WB(global uint* it, int offset, uint value);
+
+void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value);
+void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value);
+void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value);
+void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value);
+void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value);
+void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value);
+void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value);
+
+void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value);
+void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value);
+void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value);
+void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value);
+void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value);
+void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value);
+void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value);
+
+void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value);
+void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value);
+void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value);
+void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value);
+void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value);
+void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value);
+void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value);
+
+void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value);
+void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value);
+void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value);
+void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value);
+void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value);
+void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value);
+void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value);
+
+void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value);
+void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value);
+void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value);
+void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value);
+void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value);
+void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value);
+void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value);
+
+void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value);
+void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value);
+
+void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value);
+void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value);
+
+void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value);
+void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value);
+
+void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value);
+void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value);
+
+// LSC Fence support
+void mem_fence_gpu_default();
+void mem_fence_workgroup_default();
+void mem_fence_gpu_invalidate();
+void mem_fence_gpu_evict();
+void mem_fence_evict_to_memory();
diff --git a/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl
new file mode 100644
index 00000000000..2217618c7c5
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/libs/lsc_intrinsics_fallback.cl
@@ -0,0 +1,898 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// LSC Loads
+// uchar
+uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+// ushort
+uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset)
+{
+ return (uint)(it[offset]);
+}
+
+// uint
+uint load_uint_L1UC_L3UC(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1UC_L3C(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1C_L3UC(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1C_L3C(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1S_L3UC(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1S_L3C(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+uint load_uint_L1IAR_L3C(global uint* it, int offset)
+{
+ return it[offset];
+}
+
+// uint2
+uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1UC_L3C(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1C_L3UC(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1C_L3C(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1S_L3UC(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1S_L3C(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset)
+{
+ return it[offset];
+}
+
+// uint3
+uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1UC_L3C(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1C_L3UC(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1C_L3C(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1S_L3UC(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1S_L3C(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset)
+{
+ return it[offset];
+}
+
+// uint4
+uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1UC_L3C(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1C_L3UC(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1C_L3C(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1S_L3UC(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1S_L3C(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset)
+{
+ return it[offset];
+}
+
+// uint8
+uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1UC_L3C(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1C_L3UC(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1C_L3C(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1S_L3UC(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1S_L3C(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong
+ulong load_ulong_L1UC_L3UC(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1UC_L3C(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1C_L3UC(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1C_L3C(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1S_L3UC(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1S_L3C(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+ulong load_ulong_L1IAR_L3C(global ulong* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong2
+ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong3
+ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong4
+ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset)
+{
+ return it[offset];
+}
+
+// ulong8
+ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset)
+{
+ return it[offset];
+}
+
+// LSC Stores
+// uchar
+void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value)
+{
+ it[offset] = (uchar)(value);
+}
+
+// ushort
+void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value)
+{
+ it[offset] = (ushort)(value);
+}
+
+// uint
+void store_uint_L1UC_L3UC(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1UC_L3WB(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1WT_L3UC(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1WT_L3WB(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1S_L3UC(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1S_L3WB(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+void store_uint_L1WB_L3WB(global uint* it, int offset, uint value)
+{
+ it[offset] = value;
+}
+
+// uint2
+void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value)
+{
+ it[offset] = value;
+}
+
+// uint3
+void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value)
+{
+ it[offset] = value;
+}
+
+// uint4
+void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value)
+{
+ it[offset] = value;
+}
+
+// uint8
+void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value)
+{
+ it[offset] = value;
+}
+
+// ulong
+void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value)
+{
+ it[offset] = value;
+}
+
+// ulong2
+void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value)
+{
+ it[offset] = value;
+}
+
+// ulong3
+void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value)
+{
+ it[offset] = value;
+}
+
+// ulong4
+void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value)
+{
+ it[offset] = value;
+}
+
+// ulong8
+void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value)
+{
+ it[offset] = value;
+}
+
+// LSC Fence support
+void mem_fence_gpu_default()
+{
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+void mem_fence_workgroup_default()
+{
+ write_mem_fence( CLK_GLOBAL_MEM_FENCE );
+}
+
+void mem_fence_gpu_invalidate()
+{
+ read_mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+void mem_fence_gpu_evict()
+{
+ read_mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+void mem_fence_evict_to_memory()
+{
+ mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
diff --git a/src/intel/vulkan/grl/gpu/mem_utils.h b/src/intel/vulkan/grl/gpu/mem_utils.h
new file mode 100644
index 00000000000..b57a25279fd
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/mem_utils.h
@@ -0,0 +1,161 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "shared.h"
+
+/// Write cache line to global memory
+/// Assumes subgroup_size is 16
+///
+/// @param dst 64-byte-aligned output pointer
+/// @param val value to write
+GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val)
+{
+ global uint* addrAligned = (global uint*)(global uint16*)dst;
+ intel_sub_group_block_write(addrAligned, val);
+}
+
+/// Read cache line from global memory
+/// Assumes subgroup_size is 16
+///
+/// @param src 64-byte-aligned input pointer
+/// @return uint read from memory
+GRL_INLINE uint CacheLineSubgroupRead(const global char* src)
+{
+ const global uint* addrAligned = (const global uint*)(global uint16*)src;
+ return intel_sub_group_block_read(addrAligned);
+}
+
+/// Copy cache line
+/// Assumes subgroup_size is 16
+///
+/// @param dst 64-byte-aligned output pointer
+/// @param src input pointer
+GRL_INLINE void CopyCacheLine(global char* dst, const global char* src)
+{
+ global const uint* usrc = (global const uint*) (src);
+
+ uint data = intel_sub_group_block_read(usrc);
+ CacheLineSubgroupWrite(dst, data);
+}
+
+/// Fast memory copy
+///
+/// @param dst output pointer
+/// @param src input pointer
+/// @param size number of bytes to copy
+/// @param numGroups number of groups that execute this function
+GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups)
+{
+ const uint CACHELINE_SIZE = 64;
+
+ uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0);
+
+ // This part copies one cacheline per physical thread per write, starting from dst aligned up to the next cacheline boundary.
+ // It also copies the remainder bytes.
+ {
+ uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1);
+ alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1);
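+ // Worked example (illustrative): if dst & 63 == 40, then alignAdd == 24, so the
+ // 24-byte misaligned head is handled by the second block below and the copy of
+ // the aligned region starts at dst + 24.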
+
+ if (size > alignAdd)
+ {
+ uint alignedBytesCount = size - alignAdd;
+ uint alignedDWsCount = alignedBytesCount >> 2;
+ global uint* dstAlignedPart = (global uint*)(dst + alignAdd);
+ global uint* srcAlignedPart = (global uint*)(src + alignAdd);
+
+ for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups)
+ {
+ dstAlignedPart[id] = srcAlignedPart[id];
+ }
+
+ if (globalID < alignedBytesCount - (alignedDWsCount << 2))
+ {
+ global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount);
+ global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount);
+ dstByteRem[globalID] = srcByteRem[globalID];
+ }
+ }
+ }
+
+ // copy the part of dst below the address aligned up to the cacheline boundary
+ {
+ uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3;
+ if (misalignmentBytesSize)
+ {
+ if (globalID < misalignmentBytesSize)
+ {
+ dst[globalID] = src[globalID];
+ }
+ dst += misalignmentBytesSize;
+ src += misalignmentBytesSize;
+ }
+
+ uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1);
+ if (misalignmentDWSize)
+ {
+ if (globalID < (misalignmentDWSize >> 2))
+ {
+ ((global uint*)dst)[globalID] = ((global uint*)src)[globalID];
+ }
+ }
+ }
+}
+
+#define CACHELINE_SIZE 64
+#define CACHELINE_PER_BLOCK 4
+#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
+
+GRL_INLINE
+global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset)
+{
+ if (array != NULL)
+ {
+ return array + byteOffset;
+ }
+ else
+ {
+ return (global char *)arrayOfPtrs[byteOffset >> 6];
+ }
+}
+
+// Assumptions:
+// dst is always 64-byte aligned
+// size is always a multiple of 64 bytes (sizeof(InstanceDesc) is always 64 bytes)
+GRL_INLINE
+void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups)
+{
+ uint taskId = get_group_id(0);
+
+ uint blockedSize = (size) & (~(BLOCK_SIZE - 1));
+
+ uint cachelinedTailOffset = blockedSize;
+ uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1));
+
+ uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE
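+ // reversedTaskId == numGroups - 1 - taskId, so the highest-numbered groups handle the cacheline-granular tail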
+ uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1)));
+ if (reversedTaskId < tailCacheLines)
+ {
+ uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE);
+ global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
+ CopyCacheLine(dst + byteOffset, src);
+ }
+
+ uint numBlocks = blockedSize >> 8;
+ while (taskId < numBlocks)
+ {
+ uint byteOffset = (taskId * BLOCK_SIZE);
+
+ for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++)
+ {
+ global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
+ CopyCacheLine(dst + byteOffset, src);
+ byteOffset += CACHELINE_SIZE;
+ }
+
+ taskId += numGroups;
+ }
+} \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/misc.cl b/src/intel/vulkan/grl/gpu/misc.cl
new file mode 100644
index 00000000000..d32c8267b73
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/misc.cl
@@ -0,0 +1,367 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "api_interface.h"
+#include "common.h"
+#include "instance.h"
+#include "misc_shared.h"
+#include "mem_utils.h"
+
+#define DBG(x)
+#define ENABLE_CHECKS 0
+
+#define CACHELINE_SIZE 64
+#define CACHELINE_PER_BLOCK 4
+#define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
+
+GRL_INLINE
+uint32_t getGeomDescPrimitiveCountAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
+{
+ return (uint32_t)GRL_get_primitive_count(&geomDesc[index]);
+}
+
+GRL_INLINE
+uint32_t getGeomDescTypeAndFlagsAsUint32t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
+{
+ return (uint32_t)GRL_get_Type(&geomDesc[index]) |
+ (((uint32_t)GRL_get_Flags(&geomDesc[index])) << 16);
+}
+
+GRL_INLINE
+uint64_t getGeomDescAsUint64t(global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t index)
+{
+ return (uint64_t)getGeomDescPrimitiveCountAsUint32t(geomDesc, index) |
+ (((uint64_t)getGeomDescTypeAndFlagsAsUint32t(geomDesc, index)) << 32);
+}
+
+// Assumptions:
+// dst is always 64-byte aligned
+GRL_INLINE
+void copyGeoMetaData(global char* dst, global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc, uint64_t size, uint numGroups)
+{
+ uint taskId = get_group_id(0);
+ uint localId = get_sub_group_local_id();
+
+ uint cachelinedSize = (size) & (~(CACHELINE_SIZE-1));
+
+ uint reminderOffset = cachelinedSize;
+ uint reminderQWSize = (size - reminderOffset) >> 3;
+
+ uint tailCacheLines = cachelinedSize >> 6; // divide by CACHELINE_SIZE
+ uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups-1)));
+ if (reversedTaskId == tailCacheLines && localId < reminderQWSize)
+ {
+ uint reminderOffsetQW = reminderOffset >> 3;
+ global uint64_t* dstQW = (global uint64_t*)(dst);
+ dstQW[localId + reminderOffsetQW] = getGeomDescAsUint64t(geomDesc, localId + reminderOffsetQW);
+ }
+
+ uint numCacheLines = cachelinedSize >> 6;
+ while (taskId < numCacheLines)
+ {
+ uint byteOffset = taskId * CACHELINE_SIZE;
+ uint geoIdFromOffset = (byteOffset >> 3) + (localId >> 1);
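+ // Each pair of lanes produces one 8-byte geo meta-data entry: the even lane writes the
+ // primitive-count DW and the odd lane the type/flags DW, filling a full 64B cacheline per subgroup.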
+
+ uint32_t data = 0;
+ if (localId & 1)
+ {
+ data = getGeomDescTypeAndFlagsAsUint32t(geomDesc, geoIdFromOffset);
+ }
+ else
+ {
+ data = getGeomDescPrimitiveCountAsUint32t(geomDesc, geoIdFromOffset);
+ }
+ CacheLineSubgroupWrite(dst + byteOffset, data);
+
+ taskId += numGroups;
+ }
+}
+
+GRL_INLINE
+uint groupCountForInstancesCopySize(uint size)
+{
+ return (size >> 8) + 3;
+}
+
+GRL_INLINE
+uint groupCountForGeoMetaDataCopySize(uint size)
+{
+ return (size >> 6) + 1;
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instances(global char* dest, global char* instancesArray, uint64_t size)
+{
+ // global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instances_indirect(global char* dest, global char* instancesArray, global const struct IndirectBuildRangeInfo* const indirect_data)
+{
+ uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
+ instancesArray += indirect_data->primitiveOffset;
+ uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (tid == 0)
+ {
+ struct BVHBase* bvh = (struct BVHBase*)dest;
+ bvh->Meta.instanceCount = indirect_data->primitiveCount;
+ }
+ copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instance_ptrs(global char* dest, global uint64_t* arrayOfPtrs, uint64_t size)
+{
+ //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instance_ptrs_indirect(global char* dest, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
+ arrayOfPtrs += indirect_data->primitiveOffset;
+ uint tid = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (tid == 0)
+ {
+ struct BVHBase* bvh = (struct BVHBase*)dest;
+ bvh->Meta.instanceCount = indirect_data->primitiveCount;
+ }
+ copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instances_base_ptr(global BVHBase* bvh, global char* instancesArray, uint64_t size)
+{
+ global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instances_base_ptr_indirect(global BVHBase* bvh, global char* instancesArray, global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
+ instancesArray += indirect_data->primitiveOffset;
+ copyInstances(dest, instancesArray, NULL, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instance_ptrs_base_ptr(global BVHBase* bvh, global uint64_t* arrayOfPtrs, uint64_t size)
+{
+ global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_instance_ptrs_base_ptr_indirect(global BVHBase* bvh, global uint64_t* arrayOfPtrs, global struct IndirectBuildRangeInfo const * const indirect_data)
+{
+ global char* dest = (global char*)((unsigned long)bvh + bvh->Meta.instanceDescsStart);
+ uint64_t size = indirect_data->primitiveCount * sizeof(InstanceDesc);
+ arrayOfPtrs += indirect_data->primitiveOffset;
+ copyInstances(dest, NULL, arrayOfPtrs, size, groupCountForInstancesCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel copy_geo_meta_data(global char* dest, global char* src, uint64_t size)
+{
+ //global char *dest = (global char *)((unsigned long)bvh + bvh->Meta.geoDescsStart);
+ global GRL_RAYTRACING_GEOMETRY_DESC *geomDesc = (global GRL_RAYTRACING_GEOMETRY_DESC *)((unsigned long)src);
+ copyGeoMetaData(dest, geomDesc, size, groupCountForGeoMetaDataCopySize(size));
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__( ( reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 ) ) )
+__attribute__( ( intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH ) ) )
+void kernel copy_geo_descs_indirect_build(global char* dest, global char* src, global struct IndirectBuildRangeInfo const * const indirect_data, uint numGeometries)
+{
+ uint32_t gid = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ if (gid < numGeometries) {
+ global GRL_RAYTRACING_GEOMETRY_DESC* dstDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(dest);
+ global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc = (global GRL_RAYTRACING_GEOMETRY_DESC*)(src);
+
+ GRL_RAYTRACING_GEOMETRY_DESC geo = srcDesc[gid];
+
+ uint primitiveCount = indirect_data[gid].primitiveCount;
+ uint primitiveOffset = indirect_data[gid].primitiveOffset;
+ uint firstVertex = indirect_data[gid].firstVertex;
+ uint transformOffset = indirect_data[gid].transformOffset;
+
+ if (srcDesc[gid].Type == GEOMETRY_TYPE_TRIANGLES)
+ {
+ if (geo.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE)
+ {
+ geo.Desc.Triangles.VertexCount = primitiveCount * 3;
+ geo.Desc.Triangles.pVertexBuffer += primitiveOffset
+ + firstVertex * geo.Desc.Triangles.VertexBufferByteStride;
+ }
+ else
+ {
+ geo.Desc.Triangles.IndexCount = primitiveCount * 3;
+ geo.Desc.Triangles.pIndexBuffer += primitiveOffset;
+ geo.Desc.Triangles.pVertexBuffer += firstVertex * geo.Desc.Triangles.VertexBufferByteStride;
+ }
+ if (geo.Desc.Triangles.pTransformBuffer) {
+ geo.Desc.Triangles.pTransformBuffer += transformOffset;
+ }
+ } else {
+ // GEOMETRY_TYPE_PROCEDURAL
+ geo.Desc.Procedural.AABBCount = primitiveCount;
+ geo.Desc.Procedural.pAABBs_GPUVA += primitiveOffset;
+ }
+
+ dstDesc[gid] = geo;
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel batched_init_globals(global struct BatchedInitGlobalsData *data)
+{
+ uint groupID = get_group_id(0);
+
+ struct BatchedInitGlobalsData entry = data[groupID];
+
+ global struct Globals* globals = (global struct Globals*)entry.p_build_globals;
+ global char *bvh_mem = (global char*)entry.p_bvh_buffer;
+ uint numPrimitives = entry.numPrimitives;
+ uint numGeometries = entry.numGeometries;
+ uint numInstances = entry.numInstances;
+ uint instance_descs_start = entry.instance_descs_start;
+ uint geo_meta_data_start = entry.geo_meta_data_start;
+ uint node_data_start = entry.node_data_start;
+ uint quad_data_start = entry.leaf_data_start;
+ uint instance_data_start = entry.leaf_data_start;
+ uint procedural_data_start = entry.procedural_data_start;
+ uint back_pointer_start = entry.back_pointer_start;
+ uint build_record_start = entry.leaf_data_start;
+ uint totalBytes = entry.sizeTotal;
+ uint leafPrimType = entry.leafType;
+ uint leafSize = entry.leafSize;
+
+ uint root_node_offset = node_data_start;
+ struct BVHBase *base = (struct BVHBase *)bvh_mem;
+
+ base->Meta.instanceCount = numInstances;
+ base->Meta.geoCount = numGeometries;
+ base->Meta.instanceDescsStart = instance_descs_start;
+ base->Meta.geoDescsStart = geo_meta_data_start;
+ base->Meta.allocationSize = totalBytes;
+ // This doesn't work correctly
+ //ERROR_INFO initErr = { 0, 0, 0, 0xAAABBAAA };
+ //base->Meta.errors = initErr;
+ base->Meta.errors.type = 0;
+ base->Meta.errors.offset_in_BVH = 0; //in 64B units
+ base->Meta.errors.when = 0;
+ base->Meta.errors.reserved = 0xAAABBAAA;
+
+ base->nodeDataCur = node_data_start / 64;
+ base->quadLeafStart = quad_data_start / 64;
+ base->quadLeafCur = quad_data_start / 64;
+ base->instanceLeafStart = instance_data_start / 64;
+ base->instanceLeafEnd = instance_data_start / 64;
+ base->proceduralDataStart = procedural_data_start / 64;
+ base->proceduralDataCur = procedural_data_start / 64;
+ base->backPointerDataStart = back_pointer_start / 64;
+ base->refitTreeletsDataStart = totalBytes / 64;
+ base->refitStartPointDataStart = totalBytes / 64;
+ base->BVHDataEnd = totalBytes / 64;
+ base->refitTreeletCnt = 0;
+ base->refitTreeletCnt2 = 0;
+ base->rootNodeOffset = root_node_offset;
+
+ base->fatLeafCount = 0;
+ base->fatLeafTableStart = entry.fatleaf_table_start / 64;
+ base->innerCount = 0;
+ base->innerTableStart = entry.innernode_table_start / 64;
+ base->quadLeftoversCountNewAtomicUpdate = 0;
+ base->quadTableSizeNewAtomicUpdate = 0;
+ base->quadIndicesDataStart = entry.quad_indices_data_start / 64;
+
+ if (back_pointer_start != totalBytes)
+ {
+ BackPointers* back_pointers = BVHBase_GetBackPointers(base);
+ uint root_node_idx = root_node_offset - node_data_start;
+ global uint *root_node_backpointer = (global uint *)InnerNode_GetBackPointer(back_pointers,root_node_idx);
+ *root_node_backpointer = ((uint)-1) << 6;
+ }
+
+ AABB3f_init(&base->Meta.bounds);
+ AABB_init(&globals->centroidBounds);
+
+ globals->build_record_start = build_record_start;
+
+ globals->numBuildRecords = 0;
+ globals->numBuildRecords_extended = 0;
+ globals->numPrimitives = numPrimitives;
+ globals->numSplittedPrimitives = 0;
+ globals->sync = 0;
+ globals->probThreshold = 0.0f;
+ globals->leafPrimType = leafPrimType;
+ globals->leafSize = leafSize;
+}
+
+
+
+// This is a temporary workaround (WA) for the mock in DXR
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1))) void kernel copy_mock(global char *dest,
+ global char *src,
+ uint32_t size)
+{
+ uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ uint32_t globalSize = get_num_groups(0) * get_local_size(0);
+ for (uint32_t i = globalId; i < size; i += globalSize)
+ {
+ dest[i] = src[i];
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(32, 1, 1)))
+void kernel mem_set(global char *dest,
+ dword byte,
+ dword size)
+{
+ uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ if (globalId < size)
+ {
+ dest[globalId] = (char)byte;
+ }
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(32, 1, 1)))
+void kernel mem_set_size_ptr(global char *dest,
+ dword byte,
+ global qword* sizePtr)
+{
+ uint32_t globalId = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ if (globalId < *sizePtr)
+ {
+ dest[globalId] = (char)byte;
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/misc.grl b/src/intel/vulkan/grl/gpu/misc.grl
new file mode 100644
index 00000000000..cb98534afb4
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/misc.grl
@@ -0,0 +1,278 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module misc;
+
+kernel_module misc("misc.cl")
+{
+ kernel opencl_kernel_batched_init_globals < kernelFunction="batched_init_globals" >;
+ kernel opencl_kernel_copy_instances < kernelFunction="copy_instances" >;
+ kernel opencl_kernel_copy_instances_indirect < kernelFunction="copy_instances_indirect" >;
+ kernel opencl_kernel_copy_instance_ptrs < kernelFunction="copy_instance_ptrs" >;
+ kernel opencl_kernel_copy_instance_ptrs_indirect < kernelFunction="copy_instance_ptrs_indirect" >;
+ kernel opencl_kernel_copy_instances_base_ptr < kernelFunction="copy_instances_base_ptr" >;
+ kernel opencl_kernel_copy_instances_base_ptr_indirect < kernelFunction="copy_instances_base_ptr_indirect" >;
+ kernel opencl_kernel_copy_instance_ptrs_base_ptr < kernelFunction="copy_instance_ptrs_base_ptr" >;
+ kernel opencl_kernel_copy_instance_ptrs_base_ptr_indirect < kernelFunction="copy_instance_ptrs_base_ptr_indirect" >;
+ kernel opencl_kernel_copy_geo_meta_data < kernelFunction="copy_geo_meta_data" >;
+ kernel opencl_kernel_copy_geo_descs_indirect_build < source="misc.cl", kernelFunction="copy_geo_descs_indirect_build" >;
+ kernel opencl_kernel_copy_mock < kernelFunction="copy_mock" >;
+ kernel opencl_kernel_memset < kernelFunction="mem_set" >;
+ kernel opencl_kernel_memset_size_ptr < kernelFunction="mem_set_size_ptr" >;
+}
+
+import struct MKBuilderState "structs.grl";
+import struct MKSizeEstimate "structs.grl";
+
+
+metakernel batched_init_globals(
+ qword p_data,
+ dword numWgs)
+{
+ dispatch opencl_kernel_batched_init_globals(numWgs,1,1) args(p_data);
+}
+
+metakernel copy_instances(
+ qword bvh_buffer,
+ qword instanceDescsBuffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_instances (numThreads, 1, 1) args(
+ bvh_buffer,
+ instanceDescsBuffer,
+ totalSizeToCopy);
+}
+
+metakernel
+copy_instances_indirect( qword bvh_buffer, qword instanceDescsBuffer, qword indirectBuildRangeInfo )
+{
+
+ define num_groups REG0;
+ define C_2 REG2;
+ define C_3 REG3;
+
+ C_2 = 2;
+ C_3 = 3;
+
+ // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
+ // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
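+    // Worked example (illustrative only): 100 instances -> (100 >> 2) + 3 = 28 groups.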
+ num_groups = load_dword( indirectBuildRangeInfo );
+ num_groups = num_groups >> C_2;
+ num_groups = num_groups + C_3;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_instances_indirect args(
+ bvh_buffer,
+ instanceDescsBuffer,
+ indirectBuildRangeInfo);
+}
+
+metakernel copy_instance_ptrs(
+ qword bvh_buffer,
+ qword instanceDescPtrsBuffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_instance_ptrs (numThreads, 1, 1) args(
+ bvh_buffer,
+ instanceDescPtrsBuffer,
+ totalSizeToCopy);
+}
+
+metakernel copy_instance_ptrs_indirect(
+ qword bvh_buffer,
+ qword instanceDescPtrsBuffer,
+ qword indirectBuildRangeInfo)
+{
+ define num_groups REG0;
+ define C_2 REG2;
+ define C_3 REG3;
+
+ C_2 = 2;
+ C_3 = 3;
+
+ // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
+ // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
+ num_groups = load_dword( indirectBuildRangeInfo );
+ num_groups = num_groups >> C_2;
+ num_groups = num_groups + C_3;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_instance_ptrs_indirect args(
+ bvh_buffer,
+ instanceDescPtrsBuffer,
+ indirectBuildRangeInfo);
+}
+
+metakernel copy_instances_base_ptr(
+ qword bvh_buffer,
+ qword instanceDescsBuffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_instances_base_ptr (numThreads, 1, 1) args(
+ bvh_buffer,
+ instanceDescsBuffer,
+ totalSizeToCopy);
+}
+
+metakernel copy_instances_base_ptr_indirect(
+ qword bvh_buffer,
+ qword instanceDescsBuffer,
+ qword indirectBuildRangeInfo)
+{
+ define num_groups REG0;
+ define C_2 REG2;
+ define C_3 REG3;
+
+ C_2 = 2;
+ C_3 = 3;
+
+ // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
+ // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
+ num_groups = load_dword( indirectBuildRangeInfo );
+ num_groups = num_groups >> C_2;
+ num_groups = num_groups + C_3;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_instances_base_ptr_indirect args(
+ bvh_buffer,
+ instanceDescsBuffer,
+ indirectBuildRangeInfo);
+}
+
+metakernel copy_instance_ptrs_base_ptr(
+ qword bvh_buffer,
+ qword instanceDescPtrsBuffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_instance_ptrs_base_ptr (numThreads, 1, 1) args(
+ bvh_buffer,
+ instanceDescPtrsBuffer,
+ totalSizeToCopy);
+}
+
+metakernel copy_instance_ptrs_base_ptr_indirect(
+ qword bvh_buffer,
+ qword instanceDescPtrsBuffer,
+ qword indirectBuildRangeInfo)
+{
+ define num_groups REG0;
+ define C_2 REG2;
+ define C_3 REG3;
+
+ C_2 = 2;
+ C_3 = 3;
+
+ // sizeof(InstanceDesc) == 64, matches DXR and Vulkan API definitions
+ // num_groups = ((num_instances << log_2(64)) >> 8) + 3 = (num_instances >> 2) + 3
+ num_groups = load_dword( indirectBuildRangeInfo );
+ num_groups = num_groups >> C_2;
+ num_groups = num_groups + C_3;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_copy_instance_ptrs_base_ptr_indirect args(
+ bvh_buffer,
+ instanceDescPtrsBuffer,
+ indirectBuildRangeInfo);
+}
+
+metakernel copy_geo_descs(
+ qword private_dest,
+ qword transient_src,
+ qword indirectBuildRangeInfo,
+ dword numGeometries)
+{
+
+ define num_groups (numGeometries + 16 - 1) / 16;
+ dispatch opencl_kernel_copy_geo_descs_indirect_build(num_groups, 1, 1) args(
+ private_dest,
+ transient_src,
+ indirectBuildRangeInfo,
+ numGeometries);
+}
+
+metakernel copy_geo_meta_data(
+ qword bvh_buffer,
+ qword geomdesc_buffer,
+ qword totalSizeToCopy,
+ dword numThreads)
+{
+ dispatch opencl_kernel_copy_geo_meta_data (numThreads, 1, 1) args(
+ bvh_buffer,
+ geomdesc_buffer,
+ totalSizeToCopy);
+}
+
+
+const COPY_MOCK_GROUP_SIZE = 16;
+
+metakernel copy_mock(
+ qword dest,
+ qword src,
+ dword size)
+{
+ define num_groups (size + COPY_MOCK_GROUP_SIZE - 1) / COPY_MOCK_GROUP_SIZE;
+ dispatch opencl_kernel_copy_mock(num_groups, 1, 1) args(
+ dest,
+ src,
+ size);
+}
+
+metakernel memset(
+ qword dest,
+ dword byte,
+ dword size)
+{
+ define num_groups (size + 32 - 1) / 32;
+ dispatch opencl_kernel_memset(num_groups, 1, 1) args(
+ dest,
+ byte,
+ size);
+}
+
+metakernel memset_size_ptr(
+ qword dest,
+ dword byte,
+ qword sizePtr)
+{
+ define byteSize REG0;
+ define C_32 REG1; C_32 = 32;
+ define C_1 REG2; C_1 = 1;
+ define C_4 REG3; C_4 = 4;
+ define numGroupsRqd REG4;
+
+ byteSize = load_dword(sizePtr);
+
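+    // The sequence below computes numGroupsRqd = ceil(byteSize / 32): the two
+    // shifts (>> 4 then >> 1) together divide by 32, matching the work-group
+    // size of 32 used by mem_set_size_ptr.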
+ numGroupsRqd = byteSize + C_32;
+ numGroupsRqd = numGroupsRqd - C_1;
+ numGroupsRqd = numGroupsRqd >> C_4;
+ numGroupsRqd = numGroupsRqd >> C_1;
+
+ DISPATCHDIM_X = numGroupsRqd.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_kernel_memset_size_ptr args(
+ dest,
+ byte,
+ sizePtr);
+}
diff --git a/src/intel/vulkan/grl/gpu/misc_legacy.cl b/src/intel/vulkan/grl/gpu/misc_legacy.cl
new file mode 100644
index 00000000000..a464e89537c
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/misc_legacy.cl
@@ -0,0 +1,386 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "input_client_structs.h"
+#include "common.h"
+#include "instance.h"
+
+#define DBG(x)
+#define ENABLE_CHECKS 0
+
+/*
+
+  This kernel implements an exclusive scan addition operation. The
+  implementation currently uses only a single DSS.
+
+ */
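+// Illustrative example (not part of the original code): an exclusive scan of
+// input [3, 1, 7, 0] yields output [0, 3, 4, 11]; each output element is the
+// sum of all input elements strictly before it.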
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_scan_exclusive_add(global uint *input,
+ global uint *output,
+ const uint N)
+{
+ const uint j = get_local_id(0);
+ const uint J = get_local_size(0);
+ const uint BLOCKSIZE = (N + J - 1) / J;
+ const uint start = min((j + 0) * BLOCKSIZE, N);
+ const uint end = min((j + 1) * BLOCKSIZE, N);
+
+ uint base = 0;
+ for (uint i = start; i < end; i++)
+ base += input[i];
+
+ base = work_group_scan_exclusive_add(base);
+
+ uint accu = 0;
+ for (uint i = start; i < end; i++)
+ {
+ output[i] = base + accu;
+ accu += input[i];
+ }
+}
+
+/*
+
+  This kernel implements an exclusive scan addition operation that can use the entire GPU.
+
+ */
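+// Sketch of the two-phase flow (derived from the kernels below): phase0 has
+// each work group reduce its slice of the input into prefix_sums[groupID];
+// phase1 then sums the prefix_sums of all preceding groups to obtain a global
+// base and performs the local exclusive scan on top of it.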
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_scan_exclusive_add_phase0(global uint *input,
+ global uint *output,
+ global uint *prefix_sums,
+ const uint N)
+{
+ const uint local_size = get_local_size(0);
+ const uint numTasks = get_num_groups(0);
+ const uint groupID = get_group_id(0);
+ const uint localID = get_local_id(0);
+ const uint global_startID = (groupID + 0) * N / numTasks;
+ const uint global_endID = (groupID + 1) * N / numTasks;
+
+ uint base = 0;
+ for (uint i = global_startID + localID; i < global_endID; i += local_size)
+ base += input[i];
+
+ base = work_group_reduce_add(base);
+
+ if (localID == 0)
+ {
+ prefix_sums[groupID] = base;
+ printf("%d -> %d \n", groupID, base);
+ }
+}
+
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_scan_exclusive_add_phase1(global uint *input,
+ global uint *output,
+ global uint *prefix_sums,
+ const uint N)
+{
+ const uint local_size = get_local_size(0);
+ const uint numTasks = get_num_groups(0);
+ const uint groupID = get_group_id(0);
+ const uint localID = get_local_id(0);
+ const uint global_startID = (groupID + 0) * N / numTasks;
+ const uint global_endID = (groupID + 1) * N / numTasks;
+ const uint local_range = global_endID - global_startID;
+
+ uint global_base = 0;
+ for (uint i = 0; i < groupID; i++)
+ global_base += prefix_sums[i];
+
+ const uint j = get_local_id(0);
+ const uint J = get_local_size(0);
+ const uint BLOCKSIZE = (local_range + J - 1) / J;
+ const uint startID = (j + 0) * local_range / J + global_startID;
+ const uint endID = (j + 1) * local_range / J + global_startID;
+
+ uint base = 0;
+ for (uint i = startID; i < endID; i++)
+ base += input[i];
+
+ base = work_group_scan_exclusive_add(base);
+
+ uint accu = 0;
+ for (uint i = startID; i < endID; i++)
+ {
+ output[i] = global_base + base + accu;
+ accu += input[i];
+ }
+}
+
+/* ========================================================================= */
+/* ============================== STATISTICS =============================== */
+/* ========================================================================= */
+
+/* ====== STATS config ====== */
+
+#define ENABLE_STAT_CHECKS 1
+#define DBG_STATS(x)
+
+__attribute__((reqd_work_group_size(256, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+printBVHStatistics(global struct Globals *globals,
+ global char *bvh_mem,
+ global struct StatStackEntry *global_stack0,
+ global struct StatStackEntry *global_stack1,
+ const uint presplit)
+{
+ const uint globalID = get_global_id(0);
+ const uint localID = get_local_id(0);
+ const uint local_size = get_local_size(0);
+
+ struct BVHBase *base = (struct BVHBase *)bvh_mem;
+ const uint root = base->rootNodeOffset;
+
+ local uint stack_items[2];
+ local uint iterations;
+
+ struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root));
+ root_aabb = conservativeAABB(&root_aabb);
+ const float root_area = AABB_halfArea(&root_aabb);
+
+ global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);
+
+ if (root_node->type != BVH_INTERNAL_NODE)
+ {
+ const uint numChildren = getNumChildren_QBVHNodeN(root_node);
+ const uint current = root;
+ for (uint i = 0; i < numChildren; i++)
+ {
+ struct AABB aabb = extractAABB_QBVHNodeN(root_node, i);
+ const float area = AABB_halfArea(&aabb);
+
+ global_stack0[i].node = current + root_node->offset * 64 + i * sizeof(struct Quad);
+ global_stack0[i].type = root_node->type;
+ global_stack0[i].area = area;
+ global_stack0[i].aabb = aabb;
+ global_stack0[i].depth = 0;
+ }
+ stack_items[0] = numChildren;
+ stack_items[1] = 0;
+ }
+ else
+ {
+ global_stack0[0].node = root;
+ global_stack0[0].type = root_node->type;
+ global_stack0[0].area = root_area;
+ global_stack0[0].aabb = root_aabb;
+ global_stack0[0].depth = 1;
+ stack_items[0] = 1;
+ stack_items[1] = 0;
+ }
+
+ const uint maxInnerNodeOffset = globals->node_mem_allocator.cur;
+ const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur;
+
+ DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64));
+
+ iterations = 0;
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ float sah_nodes = 0.0f;
+ float sah_leaves = 0.0f;
+ uint leaves = 0;
+ uint inner_nodes = 0;
+ uint max_depth = 0;
+ uint leaf_items = 0;
+ uint inner_nodes_valid_children = 0;
+
+ while (1)
+ {
+ work_group_barrier(CLK_GLOBAL_MEM_FENCE);
+ const uint buffer_index = (iterations % 2) == 0 ? 0 : 1;
+ global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1;
+ global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0;
+
+ const uint local_stack_items = stack_items[buffer_index];
+ stack_items[1 - buffer_index] = 0;
+
+ DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items));
+
+ if (local_stack_items == 0)
+ break;
+ //if (iterations == 5) break;
+
+ work_group_barrier(CLK_GLOBAL_MEM_FENCE);
+
+ if (globalID == 0)
+ iterations++;
+
+ for (uint sindex = localID; sindex < local_stack_items; sindex += local_size)
+ {
+
+ uint current = input_global_stack[sindex].node;
+ uint type = input_global_stack[sindex].type;
+ float current_area = input_global_stack[sindex].area;
+ struct AABB current_aabb = input_global_stack[sindex].aabb;
+ uint current_depth = input_global_stack[sindex].depth;
+
+ //printf("localID %d sindex %d current %d type %d local_stack_items %d \n",localID,sindex,current,type,local_stack_items);
+
+ max_depth = max(max_depth, current_depth);
+
+ if (type == BVH_QUAD_NODE)
+ {
+ unsigned int prims = 1; //getNumLeafPrims(current);
+ if (prims > BVH_LEAF_N_MAX)
+ printf("too many items in leaf %d \n", prims);
+ unsigned int prims_offset = current; //getLeafOffset(current);
+ //printf("prims_offset %d \n",prims_offset);
+
+ leaf_items += prims;
+ sah_leaves += current_area;
+ leaves++;
+#if ENABLE_STAT_CHECKS == 1
+ struct AABB leafAABB;
+ AABB_init(&leafAABB);
+
+ global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset);
+ //printf("prims_offset %d \n",prims_offset);
+
+ for (uint i = 0; i < prims; i++)
+ {
+ struct AABB quadAABB = getAABB_Quad(&quads[i]);
+ AABB_extend(&leafAABB, &quadAABB);
+ }
+
+ if (!presplit && !AABB_subset(&leafAABB, &current_aabb))
+ {
+ printf("leaf error: current %d depth %d \n", current, current_depth);
+ AABB_print(&current_aabb);
+ printf("leaf bounds: \n");
+ AABB_print(&leafAABB);
+ }
+#endif
+ }
+ else if (type == BVH_INTERNAL_NODE)
+ {
+ inner_nodes++;
+ sah_nodes += current_area;
+ global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current);
+
+ uint children = 0;
+ for (uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i])
+ break;
+ children++;
+ }
+ //printf("children %d \n",children);
+
+#if ENABLE_STAT_CHECKS == 1
+ if (children > BVH_NODE_N6 || children == 0)
+ {
+ printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID);
+ printQBVHNodeN(nodeN);
+ }
+
+ if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0)
+ {
+ printf("offset error %d \n", nodeN->offset);
+ }
+#endif
+
+ uint children_offset = atomic_add(&stack_items[1 - buffer_index], children);
+
+ for (uint i = 0; i < children; i++)
+ {
+ inner_nodes_valid_children++;
+
+ struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i);
+ const float area = AABB_halfArea(&aabb);
+
+ aabb = conservativeAABB(&aabb);
+
+#if 0 // ENABLE_STAT_CHECKS == 1 // FIXME: not clear whether parent child property still holds !!!!
+
+ // if (aabb.lower.x == (float)(INFINITY))
+ // {
+ // printf("aabb inf error %d current %d nodeN %d \n",i, current, children);
+ // break;
+ // }
+
+
+ if (!presplit && !AABB_subset(&aabb,&current_aabb))
+ {
+ printf("Parent: current %d depth %d children %d \n",current, current_depth, children);
+ AABB_print(&current_aabb);
+ printf("Child %d: \n",i);
+ AABB_print(&aabb);
+ }
+#endif
+
+ uint dest_index = children_offset + i;
+ if (nodeN->type == BVH_QUAD_NODE)
+ {
+ output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad);
+ if (output_global_stack[dest_index].node >= maxLeafNodeOffset)
+ {
+ printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64);
+ }
+ }
+ else if (nodeN->type == BVH_INTERNAL_NODE)
+ {
+ output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN));
+ if (output_global_stack[dest_index].node >= maxInnerNodeOffset)
+ {
+ printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset);
+ }
+ }
+
+ output_global_stack[dest_index].type = nodeN->type;
+ output_global_stack[dest_index].area = area;
+ output_global_stack[dest_index].aabb = aabb;
+ output_global_stack[dest_index].depth = current_depth + 1;
+ //printf("global_stack[dest_index].node %d global_stack[dest_index].type %d \n",global_stack[dest_index].node,global_stack[dest_index].type);
+ }
+ }
+ }
+ }
+
+ sah_nodes = work_group_reduce_add(sah_nodes);
+ sah_leaves = work_group_reduce_add(sah_leaves);
+ leaves = work_group_reduce_add(leaves);
+ inner_nodes = work_group_reduce_add(inner_nodes);
+ max_depth = work_group_reduce_max(max_depth);
+ leaf_items = work_group_reduce_add(leaf_items);
+ inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children);
+
+ if (globalID == 0)
+ {
+ /*
+ sah_nodes *= 1.0f / root_area;
+ sah_leaves *= 1.0f / root_area;
+ float sah = sah_nodes + sah_leaves;
+
+ const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start;
+ const uint totalAllocatedMem = globals->totalAllocatedMem;
+
+ printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n",BVH_NODE_N6,BVH_LEAF_N_MIN,BVH_LEAF_N_MAX);
+ float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6);
+ float leaf_util = 100.0f * (float)leaf_items / (leaves);
+ printf("allocators: node %d -> %d ; leaf %d -> %d \n",globals->node_mem_allocator_cur,globals->node_mem_allocator_start,globals->leaf_mem_allocator_cur,globals->leaf_mem_allocator_start);
+ printf("inner nodes %d leaves %d sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) \n",inner_nodes,leaves,sah,sah_nodes,sah_leaves,max_depth,leaf_items,node_util,leaf_util,(float)leaf_items / leaves);
+ uint node_mem = globals->node_mem_allocator_cur;
+ uint max_node_mem = globalLeafMemAllocatorOffset;
+ float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem;
+
+ uint leaf_mem = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset;
+ uint max_leaf_mem = totalAllocatedMem - globalLeafMemAllocatorOffset;
+ float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem;
+
+ uint total_mem = node_mem + leaf_mem;
+ float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem;
+
+ printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n",node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem);
+ */
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/misc_shared.h b/src/intel/vulkan/grl/gpu/misc_shared.h
new file mode 100644
index 00000000000..218f2fa4291
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/misc_shared.h
@@ -0,0 +1,196 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file contains structure definitions shared by GRL OCL kernels and host code
+//
+
+#pragma once
+
+#include "GRLGen12.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(MISC)
+
+struct BatchedInitGlobalsData
+{
+ qword p_build_globals;
+ qword p_bvh_buffer;
+ dword numPrimitives;
+ dword numGeometries;
+ dword numInstances;
+ dword instance_descs_start;
+ dword geo_meta_data_start;
+ dword node_data_start;
+ dword leaf_data_start;
+ dword procedural_data_start;
+ dword back_pointer_start;
+ dword sizeTotal;
+ dword leafType;
+ dword leafSize;
+ dword fatleaf_table_start;
+ dword innernode_table_start;
+ dword quad_indices_data_start;
+};
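+// Note: the *_start fields above appear to be byte offsets into the BVH
+// buffer; the init code in misc.cl divides them by 64 when filling the
+// corresponding BVHBase fields (an observation of that code, not a contract).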
+
+/// Header of debug buffer
+///
+/// The header is placed at the beginning of the debug buffer.
+/// After the header there is circular buffer space.
+typedef struct DebugBufferHeader
+{
+    /// Offset to the beginning of the buffer (after the header)
+    dword headStart;
+    /// Offset to free memory in the buffer (used by gpu)
+    dword gpuHead;
+    /// Offset to the end of data in the buffer that is ready to read (read on cpu, set on gpu, might be behind gpuHead)
+    dword cpuHead;
+    /// Flag for buffer overflow
+    dword overflow;
+    /// Total size of the buffer
+    dword totalSize;
+    /// Padding needed because otherwise the GPU overwrites the tail with a cacheline flush
+    dword pad[11];
+    /// Offset to the beginning of data in the buffer
+    dword tail;
+} DebugBufferHeader;
+
+enum InputDumpOperationType
+{
+ INPUT_DUMP_OP_NOP,
+ INPUT_DUMP_OP_BATCH,
+ INPUT_DUMP_OP_BUILD,
+ INPUT_DUMP_OP_UPDATE,
+ INPUT_DUMP_OP_CLONE,
+ INPUT_DUMP_OP_COMPACT,
+ INPUT_DUMP_OP_SERIALIZE,
+ INPUT_DUMP_OP_DESERIALIZE,
+ INPUT_DUMP_OP_END_BUFFER
+};
+
+// each operation starts with the same header structure and looks like this
+
+// some defined struct { <-----------------start
+// OpHeader
+// .... struct type specific data
+// }
+// ... auxiliary data of variable length
+// <-------------------------------------- end - indicated by endOfData
+typedef struct OpHeader
+{
+ dword operationType;
+ dword endOfData; // offset to end of this primitive
+} OpHeader;
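+// A dump reader can walk the stream by reading the OpHeader at the current
+// position, dispatching on operationType, and advancing to the offset given by
+// endOfData until it hits an *_OP_END_BUFFER marker (a sketch inferred from
+// these definitions, not a normative description of the dump format).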
+
+// header for batch operations
+typedef struct BatchOpHeader
+{
+ OpHeader opHeader;
+} BatchOpHeader;
+
+// interpretation for operationType INPUT_DUMP_OP_BATCH
+typedef struct InputBatch
+{
+ BatchOpHeader header;
+ qword batchId;
+ dword vertexBufferDataSize;
+ dword firstContainedOpOffset;
+
+ // layout of batch is as below, each line is 128B aligned:
+
+ //
+ // InputBatch <-------------------------------- start
+ // optional: batchVertexData
+ // InputBuildDesc/InputCopy <------------------ start + firstContainedOpOffset
+ // optional: extra data of above token
+ // InputBuildDesc/InputCopy
+ // optional: extra data of above token
+ // ...
+ // InputBuildDesc/InputCopy
+ // optional: extra data of above token
+ // <-------------------------------------------- end = start + endOfData
+} InputBatch;
+
+// for operationType:
+// INPUT_DUMP_OP_BUILD,
+// INPUT_DUMP_OP_UPDATE,
+// followed by auxiliary data of variable length
+typedef struct InputBuild
+{
+ OpHeader header;
+ qword srcBvhPtr;
+ qword dstBvhPtr;
+ dword flags;
+ dword numGeos;
+ dword numInstances;
+ dword instArrayOfPtrs;
+} InputBuild;
+
+// for operationType:
+// INPUT_DUMP_OP_CLONE,
+// INPUT_DUMP_OP_COMPACT,
+// INPUT_DUMP_OP_SERIALIZE,
+//
+// Not for INPUT_DUMP_OP_DESERIALIZE!
+typedef struct InputCopy
+{
+ OpHeader header;
+ qword srcBvhPtr;
+ qword dstBvhPtr;
+} InputCopy;
+
+// for INPUT_DUMP_OP_DESERIALIZE
+// decode for debug tools follows this format
+typedef struct InputDeserialize
+{
+ OpHeader header;
+ qword dstBvhPtr;
+} InputDeserialize;
+
+typedef struct InputBatchPtrs
+{
+ qword dumpDst;
+ qword globalDumpBuffer;
+ qword nonVertexDataStart;
+ dword vertexBuffersSize;
+ dword totalSize;
+} InputBatchPtrs;
+
+enum OutputDumpOperationType
+{
+ OUTPUT_DUMP_OP_NOP,
+ OUTPUT_DUMP_OP_BATCH,
+ OUTPUT_DUMP_OP_DATA,
+ OUTPUT_DUMP_OP_END_BUFFER
+};
+
+// interpretation for operationType OUTPUT_DUMP_OP_BATCH
+typedef struct OutputBatch {
+ BatchOpHeader header;
+ qword batchId;
+ dword firstContainedOpOffset;
+} OutputBatch;
+
+// interpretation for operationType OUTPUT_DUMP_OP_DATA
+typedef struct OutputData
+{
+ OpHeader header;
+ qword srcBvhPtr;
+} OutputData;
+
+typedef struct OutputBatchPtrs
+{
+ qword dumpDst;
+ qword dataStart;
+ dword dataSize;
+ dword totalSize;
+} OutputBatchPtrs;
+
+GRL_NAMESPACE_END(MISC)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/gpu/morton/morton_common.h b/src/intel/vulkan/grl/gpu/morton/morton_common.h
new file mode 100644
index 00000000000..2beb7a1aff3
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/morton_common.h
@@ -0,0 +1,245 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "common.h"
+
+#define MORTON_DEBUG_CHECKS 0
+#define MORTON_VERBOSE_LOG 0
+
+GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift )
+{
+#if 0 // turn off, because current hierarchy build requires full sort
+ // Difference between max iterations needed for LSB sorting and
+ // number of iterations needed for LSB sorting without primIDs
+    // This indicates how many of the first iterations would be skipped in LSB
+ return 8 - (8 - (shift >> 3));
+#else
+ return 0;
+#endif
+}
+
+typedef struct BuildRecordLocalMortonFlattener
+{
+ unsigned int leftChild; // global
+ unsigned int rightChild; // global
+ unsigned int rangeStart; // global
+ unsigned int local_parent_index__numItems;
+} BuildRecordLocalMortonFlattener;
+
+// TODO: Currently sizeof(UPerNodeData) is 32; the AABB struct allocates more space than needed and could be reduced
+typedef union UPerNodeData {
+ float4 four_DWs;
+ BuildRecordLocalMortonFlattener buildRecord;
+ MortonFlattenedBoxlessNode boxlessNode;
+ struct AABB box;
+} UPerNodeData;
+
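+// MortonFlattenedBoxlessNode packs the node type into the low 6 bits of
+// childOffset_type and the child offset into the remaining upper bits; the two
+// accessors below decode these fields.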
+GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn)
+{
+ return bn.childOffset_type >> 6;
+}
+
+GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn)
+{
+ return bn.childOffset_type & ((1<<6) -1);
+}
+
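+// The 2xSG array helpers below keep a small array of up to 2 * sub_group_size
+// 16-bit values in registers: entry i lives in lane (i % sub_group_size), in
+// the low or high 16 bits of that lane's uint depending on (i / sub_group_size).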
+GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
+{
+ short lane_used = index % get_sub_group_size();
+ short shift = (index / get_sub_group_size()) * get_sub_group_size();
+ if (lane_used == lane) {
+ *arr |= (val << shift);
+ }
+}
+
+GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane)
+{
+ short r = 0;
+ short lane_used = index % get_sub_group_size();
+ short shift = (index / get_sub_group_size()) * get_sub_group_size();
+ r = arr >> shift;
+ r = sub_group_broadcast(r, lane_used);
+ return r;
+}
+
+GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst)
+{
+ if (lane < count)
+ {
+ dst[lane]=(ushort)(arr & 0xFFFF);
+ short hi_idx = lane + get_sub_group_size();
+ if (hi_idx < count) {
+ dst[hi_idx] = (ushort)(arr >> 16);
+ }
+ }
+}
+
+
+GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane)
+{
+ if (lane < count)
+ {
+ *arr = src[lane];
+ short hi_idx = lane + get_sub_group_size();
+ if (hi_idx < count) {
+ *arr |= ((uint)(src[hi_idx])) << 16u;
+ }
+ }
+}
+
+GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane)
+{
+ short lane_used = index % get_sub_group_size();
+ short shift = (index / get_sub_group_size()) * get_sub_group_size();
+ if (lane_used == lane) {
+        uint rem_val = (*arr) & (0xFFFF0000 >> shift); // keep the remaining other half of the uint
+ *arr = (val << shift) | rem_val;
+ }
+}
+
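+// Summary of the function below: bottom-up refit over a treelet held in SLM.
+// Starting from the recorded startpoints, the subgroup reduces its children's
+// AABBs, writes the merged box and the corresponding QBVH node, and climbs to
+// the parent; the low bits of each backpointer count refitted children, so the
+// walk only continues upward once the last child has arrived.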
+GRL_INLINE void SUBGROUP_refit_bottom_up_local(
+ uniform struct QBVHNodeN* globalNodeData,
+ uniform struct BackPointers* backPointers,
+ uniform uint treeletRootGlobalIndex,
+ uniform uint globalBaseForInternalNodes,
+ varying ushort lane,
+ uniform local union UPerNodeData* local_nodes,
+ varying uint sg_bu_startpoints,
+ uniform uint sg_bu_startpoints_cnt)
+{
+ if(sg_bu_startpoints_cnt == 0)
+ return;
+
+ const uint head_lane = 0;
+ uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
+
+ uniform uint prev_loc_index = 0;
+ uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
+
+ uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
+
+ while (curNodeIndex != 0)
+ {
+ uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode);
+ uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
+ varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane;
+
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+ if (child_loc_idx != prev_loc_index &&
+ lane < numChildren)
+ {
+ child_aabb = local_nodes[child_loc_idx].box;
+ }
+ else if (lane >= numChildren) {
+ AABB_init(&child_aabb);
+ child_aabb.lower.w = as_float(0u);
+ }
+
+ // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
+ struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
+ reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 );
+
+ uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
+ reduced_bounds.lower.w = as_float((uint)instMask);
+ uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0);
+ local uint* pbox = (local uint*)(local_nodes+ curNodeIndex);
+ if (lane < 8)
+ {
+ pbox[lane] = reduce_bounds_lane;
+ }
+
+ uint global_node_idx = globalBaseForInternalNodes + curNodeIndex;
+ /* get bounds of all children from child nodes directly */
+ struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
+ subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false);
+ child_aabb = reduced_bounds;
+ uint parentIndex = BackPointer_GetParentIndex(backpointer);
+
+ write_mem_fence(CLK_LOCAL_MEM_FENCE);
+
+ if (lane == 0)
+ {
+ backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer));
+ uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex;
+ uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3);
+
+ /* set global back pointer */
+ *InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer;
+
+#if MORTON_VERBOSE_LOG
+ printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n",
+ global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx);
+#endif
+ }
+
+ backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
+ prev_loc_index = curNodeIndex;
+ curNodeIndex = parentIndex;
+
+ /* if all children got refitted, then continue */
+ uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
+ uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
+ if (numChildrenRefitted != numChildrenTotal)
+ {
+ if(sg_bu_startpoints_cnt)
+ {
+ curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
+ backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
+ }
+ else
+ return;
+ }
+ }
+
+ // process root of the treelet
+ {
+
+#if MORTON_DEBUG_CHECKS
+ if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
+#endif
+
+ uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode);
+ varying uint child_loc_idx = lead_child_loc_offset + 0 + lane;
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ if (child_loc_idx != prev_loc_index &&
+ lane < numChildren)
+ {
+ child_aabb = local_nodes[child_loc_idx].box;
+ }
+ else if (lane >= numChildren) {
+ AABB_init(&child_aabb);
+ child_aabb.lower.w = as_float(0u);
+ }
+
+ // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
+ uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
+ uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
+ uint global_node_idx = treeletRootGlobalIndex;
+ uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset;
+
+ /* get bounds of all children from child nodes directly */
+ struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
+
+ subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, instMask, qnode, false);
+
+ /* reset refit counter for next refit */
+ if (lane == 0)
+ {
+ /* set global back pointer */
+ *InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u);
+
+ // TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes
+
+#if MORTON_VERBOSE_LOG
+ printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
+ curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
+#endif
+ }
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/morton/phase0.cl b/src/intel/vulkan/grl/gpu/morton/phase0.cl
new file mode 100644
index 00000000000..2fa91c214e1
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/phase0.cl
@@ -0,0 +1,400 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "libs/lsc_intrinsics.h"
+#include "morton/morton_common.h"
+
+GRL_INLINE void SUBGROUP_create_node_phase0(
+ uniform global struct Globals* globals,
+ uniform global struct BinaryMortonCodeHierarchy* bnodes,
+ uniform global char* bvh_mem,
+ uniform global uint *global_refit_startpoints,
+ uniform uint rID,
+ uniform local uint* local_numRecords,
+ uniform local uint* local_QNodeOffset,
+ uniform global struct BuildRecordMorton* records,
+ uniform struct BuildRecordMorton current,
+ uniform local uint* local_startpoints_num)
+{
+ uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+ uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+
+ varying ushort lane = get_sub_group_local_id();
+
+ /* initialize child array */
+ uniform uint numChildren = 2;
+ varying struct BuildRecordMorton sg_children;
+ sg_children.items = 0;
+ sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;
+
+ if ( lane < numChildren )
+ sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );
+
+ /* fill QBVH6 node with up to 6 children */
+ while ( numChildren < BVH_NODE_N6 )
+ {
+ varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
+ if ( sub_group_all( sg_is_leaf ) )
+ break;
+
+ uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
+ uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
+ uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );
+
+ varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;
+
+ if ( lane == numChildren || lane == bestChild )
+ {
+ sg_children.nodeID = nodeID;
+ sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
+ }
+
+ numChildren++;
+ }
+
+ const uint current_index = current.current_index;
+ struct QBVHNodeN* qnode = nodeData + current_index;
+ SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
+
+ uniform uint global_offset;
+ uniform uint child_node_offset;
+
+    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
+    // used later in the global refit after phase1.
+ varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
+ uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);
+
+ if ( lane == 0 )
+ {
+ child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
+
+ /* create node, but to not set bounds yet as these get calculated during refit */
+ QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE );
+ QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) );
+ /* set back pointers */
+ uint backpointer = (current.parent_index << 6) | (numChildren << 3);
+
+ global_offset = atomic_add_local( local_numRecords, numChildren - 1 );
+
+#if MORTON_VERBOSE_LOG
+ printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
+ rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren);
+#endif
+
+ if(children_roots_num == numChildren)
+ {
+ uint startpoints_offset = atomic_inc_local( local_startpoints_num );
+ global_refit_startpoints[startpoints_offset] = current_index;
+ }
+ else
+ {
+ backpointer += children_roots_num;
+ }
+
+ *InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
+ }
+
+ child_node_offset = sub_group_broadcast( child_node_offset, 0 );
+ global_offset = sub_group_broadcast( global_offset, 0 );
+
+ uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);
+
+ sg_children.current_index = childNodes - nodeData + lane;
+ sg_children.parent_index = current_index;
+
+ if ( lane < numChildren )
+ {
+ uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
+ records[write_position] = sg_children;
+ }
+}
+
+
+GRL_INLINE void SUBGROUP_create_node_phase0_local_sync(
+ uniform global struct Globals* globals,
+ uniform global struct BinaryMortonCodeHierarchy* bnodes,
+ uniform global char* bvh_mem,
+ uniform uint rID,
+ uniform local uint* local_numRecords,
+ uniform local uint* local_QNodeOffset,
+ uniform global struct BuildRecordMorton* records,
+ uniform struct BuildRecordMorton current,
+ uniform local uint* local_p0_total,
+ uniform global struct MortonFlattenedBoxlessNode *boxless_nodes,
+ uniform uint nodeDataStart)
+{
+ uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ uniform const uint rootNodeOffset = bvh->rootNodeOffset;
+ uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+ uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+
+ varying ushort lane = get_sub_group_local_id();
+
+ /* initialize child array */
+ uniform uint numChildren = 2;
+ varying struct BuildRecordMorton sg_children;
+ sg_children.items = 0;
+ sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;
+
+ if ( lane < numChildren )
+ sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );
+
+ /* fill QBVH6 node with up to 6 children */
+ while ( numChildren < BVH_NODE_N6 )
+ {
+ varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
+ if ( sub_group_all( sg_is_leaf ) )
+ break;
+
+ uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
+ uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
+ uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );
+
+ varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;
+
+ if ( lane == numChildren || lane == bestChild )
+ {
+ sg_children.nodeID = nodeID;
+ sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
+ }
+
+ numChildren++;
+ }
+
+ const uint current_index = current.current_index;
+ uniform uint global_offset;
+ uniform uint child_node_offset;
+
+    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
+    // used later in the global refit after phase1.
+ varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
+ uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane);
+ uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);
+
+ if ( lane == 0 )
+ {
+ child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
+
+ /* Do not create qnodes here */
+ uint backpointer = (current.parent_index << 6) | (numChildren << 3);
+
+ global_offset = atomic_add_local( local_numRecords, numChildren - 1 );
+
+#if MORTON_VERBOSE_LOG
+        printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n",
+               rID, current_index, child_node_offset / 64 - nodeDataStart, (child_node_offset / 64 - nodeDataStart) - current_index, current.parent_index, numChildren, nodeDataStart);
+#endif
+
+ MortonFlattenedBoxlessNode flattened_node;
+
+ if(children_roots_num != numChildren)
+ backpointer += children_roots_num;
+
+ flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask;
+
+ uint loc_id = atomic_inc_local( local_p0_total );
+
+ flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE;
+ flattened_node.backPointer = backpointer;
+
+        // TODO: change these writes to L1WB or streaming
+ boxless_nodes[loc_id] = flattened_node;
+
+ *InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
+ }
+
+ child_node_offset = sub_group_broadcast( child_node_offset, 0 );
+ global_offset = sub_group_broadcast( global_offset, 0 );
+
+ uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);
+
+ sg_children.current_index = childNodes - nodeData + lane;
+ sg_children.parent_index = current_index;
+
+ if ( lane < numChildren )
+ {
+ uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
+ records[write_position] = sg_children;
+ }
+}
+
+/*
+
+  In this phase a single large work group performs the construction of
+  the top of the BVH and creates a build record array.
+
+  Two variants of this kernel:
+  1. Refit with global synchronization - used for a big BVH, where the number of allocated nodes will
+     not fit in SLM in phase2. Phase0 creates qnodes in the BVH and provides startpoints for the
+     bottom-up phase that is executed after phase1. This refit uses global synchronization and
+     mem_fence_gpu_invalidate, which is not efficient.
+  2. Refit with local synchronization - flattened boxless nodes are passed via global memory, along
+     with the number of created nodes. Phase0 does not create qnodes in the BVH; that is done in
+     phase2 during refit. In phase2, the flattened boxless nodes are moved to SLM, along with the
+     bounding boxes from phase1. Refit is performed with local synchronization only.
+
+*/
+
+__attribute__((reqd_work_group_size(512, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+parallel_build_phase0(global struct Globals *globals,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem,
+ global uint *global_refit_startpoints)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
+
+ /* a queue of build records in global memory */
+ global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
+ local uint local_numRecords;
+ local uint local_QNodeOffset;
+ local uint local_startpoints_num;
+
+ /* initialize first build record */
+ if (get_local_id(0) == 0)
+ {
+ /* allocate root node */
+ uint root_node_offset = 64*bvh->nodeDataCur;
+ global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);
+
+ //assert(root_node_offset == 0);
+ records[0].nodeID = globals->binary_hierarchy_root;
+ records[0].items = globals->numPrimitives;
+ records[0].current_index = rootNode - nodeData;
+ records[0].parent_index = -1;
+
+ local_numRecords = 1;
+ local_QNodeOffset = root_node_offset + 64;
+ local_startpoints_num = 0;
+
+ mem_fence_workgroup_default();
+ }
+
+ uint num_records = 1;
+
+ /* terminate when all subtrees are under size threshold */
+ while(true)
+ {
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* all work items in the work group pick a subtree to build */
+ for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
+ {
+ /* small subtrees will get built in next phase */
+ if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
+ continue;
+
+ /* create QBVH node */
+ SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset,
+ records, records[ID], &local_startpoints_num);
+ }
+
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+ mem_fence_workgroup_default();
+ uint old_num_records = num_records;
+ num_records = local_numRecords;
+ if( old_num_records == num_records )
+ break;
+
+ }
+
+ /* remember number of build records for next phase */
+ if (get_local_id( 0 ) == 0)
+ {
+ globals->numBuildRecords = local_numRecords;
+ globals->p0_created_num = local_startpoints_num;
+ bvh->nodeDataCur = local_QNodeOffset / 64;
+
+#if MORTON_VERBOSE_LOG
+ printf("PHASE_0: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num);
+#endif
+ }
+}
+
+__attribute__((reqd_work_group_size(512, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+parallel_build_phase0_local_sync(global struct Globals *globals,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem,
+ global struct MortonFlattenedBoxlessNode *boxless_nodes)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
+ uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
+
+ /* a queue of build records in global memory */
+ global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
+ local uint local_numRecords;
+ local uint local_QNodeOffset;
+ local uint local_p0_total;
+
+ /* initialize first build record */
+ if (get_local_id(0) == 0)
+ {
+ /* allocate root node */
+ uint root_node_offset = 64*bvh->nodeDataCur;
+ global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);
+
+ //assert(root_node_offset == 0);
+ records[0].nodeID = globals->binary_hierarchy_root;
+ records[0].items = globals->numPrimitives;
+ records[0].current_index = rootNode - nodeData;
+ records[0].parent_index = -1;
+
+ local_numRecords = 1;
+ local_QNodeOffset = root_node_offset + 64;
+ local_p0_total = 0;
+
+ mem_fence_workgroup_default();
+ }
+
+ uint num_records = 1;
+
+ /* terminate when all subtrees are under size threshold */
+ while(true)
+ {
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* all work items in the work group pick a subtree to build */
+ for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
+ {
+ /* small subtrees will get built in next phase */
+ if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
+ continue;
+
+ /* create QBVH node */
+ SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records,
+ records[ID], &local_p0_total, boxless_nodes, nodeDataStart);
+ }
+
+ mem_fence_workgroup_default();
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ uint old_num_records = num_records;
+ num_records = local_numRecords;
+ if( old_num_records == num_records )
+ break;
+
+ }
+
+ /* remember number of build records for next phase */
+ if (get_local_id( 0 ) == 0)
+ {
+ globals->numBuildRecords = local_numRecords;
+ bvh->nodeDataCur = local_QNodeOffset / 64;
+
+ globals->p0_allocated_num = BVHBase_numNodes(bvh);
+ globals->p0_created_num = local_p0_total;
+
+#if MORTON_VERBOSE_LOG
+ printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->global_refit_startpoints);
+#endif
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/morton/phase1.cl b/src/intel/vulkan/grl/gpu/morton/phase1.cl
new file mode 100644
index 00000000000..6a1dd2aa44b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/phase1.cl
@@ -0,0 +1,785 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "libs/lsc_intrinsics.h"
+#include "morton/morton_common.h"
+
+// Caution: rec.local_parent_index__numItems needs to have its high 16 bits filled in afterwards.
+BuildRecordLocalMortonFlattener TranslateToLocalRecord(struct BinaryMortonCodeHierarchy srcRec)
+{
+ BuildRecordLocalMortonFlattener rec;
+ rec.leftChild = srcRec.leftChild;
+ rec.rightChild = srcRec.rightChild;
+ rec.rangeStart = srcRec.range.start;
+ rec.local_parent_index__numItems = (srcRec.range.end - srcRec.range.start) + 1;
+ return rec;
+}
+
+GRL_INLINE BuildRecordLocalMortonFlattener MortonFlattenedBoxlessNode_reinterpret_as_BR(MortonFlattenedBoxlessNode boxless)
+{
+ BuildRecordLocalMortonFlattener rec;
+ rec.leftChild = boxless.binary_hierarchy_index;
+ rec.rightChild = boxless.childOffset_type;
+ rec.rangeStart = boxless.backPointer;
+ rec.local_parent_index__numItems = 0;
+ return rec;
+}
+
+GRL_INLINE void SUBGROUP_create_boxless_node_phase1(
+ uniform global struct Globals* globals,
+ uniform global struct BinaryMortonCodeHierarchy* bnodes,
+ uniform global char* bvh_mem,
+ uniform BuildRecordLocalMortonFlattener currentRecord,
+ uniform uint currQnodeLocalId, //local index for flattened qnoode, don't mix this with nodeIndex that is in morton build record
+ uniform local uint* local_numRecords,
+ uniform uint tictoc,
+ uniform uint* sg_bu_startpoint_arr,
+ uniform uint* sg_bu_startpoint_cnt,
+ uniform uint parentOfRoot,
+ uniform bool processRoot,
+ uniform UPerNodeData* nodeData)
+{
+ varying ushort lane = get_sub_group_local_id();
+
+ /* initialize child array */
+ uniform uint numChildren = 2;
+ varying struct BuildRecordLocalMortonFlattener sg_children;
+ sg_children.local_parent_index__numItems = 0;
+
+ uint binary_hierarchy_child_idx = (lane == 0) ? currentRecord.leftChild : currentRecord.rightChild;
+ if (lane >= numChildren) binary_hierarchy_child_idx = 1 << 31;
+
+ sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, binary_hierarchy_child_idx));
+
+ /* fill QBVH6 node with up to 6 children */
+ while (numChildren < BVH_NODE_N6)
+ {
+ // we dont have to do "local_parent_index__numItems & 0xFFFF" because local_parent_index part is 0 here at this point
+ uint childNumItems = sg_children.local_parent_index__numItems;
+ varying bool sg_is_leaf = childNumItems <= cfg_minLeafSize;
+ if (sub_group_all(sg_is_leaf)) { break; }
+
+ uniform uint bestItems = sub_group_reduce_max_N6(childNumItems);
+ uniform ushort bestChild = ctz(intel_sub_group_ballot(childNumItems == bestItems));
+ varying uint leftOfBest = sg_children.leftChild; // val important only for (lane == bestChild), not valid for other lanes
+ uniform uint rightOfBest = sub_group_broadcast(sg_children.rightChild, bestChild);
+
+ varying uint nodeID = (lane == bestChild) ? leftOfBest : rightOfBest;
+
+ if (lane == numChildren || lane == bestChild)
+ {
+ sg_children = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, nodeID));
+ }
+
+ numChildren++;
+ }
+
+ uniform uint global_offset;
+ uniform uint child_node_index;
+
+ bool isFatleafChild = (sg_children.local_parent_index__numItems <= cfg_minLeafSize) && (lane < numChildren);
+ uint numFatleafChildren = popcount(intel_sub_group_ballot(isFatleafChild));
+
+ if (lane <= numChildren) {
+ uint writeIDX = 0;
+
+ if (lane == numChildren)
+ {
+ /* create nodes in local structure, to be used later in the bottom up to create nodes in actual bvh */
+ MortonFlattenedBoxlessNode flattened_node;
+ uint parentIDX;
+
+ if (processRoot)
+ {
+ *local_numRecords = numChildren + 1;
+ child_node_index = 1;
+ writeIDX = 0;
+ flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
+ flattened_node.childOffset_type = (1 << 6) | BVH_INTERNAL_NODE;
+ parentIDX = parentOfRoot;
+ }
+ else
+ {
+ uint shift = (16 * tictoc);
+ uint mask = 0xFFFF;
+ uint atomicAddVal = numChildren << shift;
+ child_node_index = atomic_add_local(local_numRecords, atomicAddVal);
+ sub_group_barrier(0);
+ writeIDX = currQnodeLocalId;
+ parentIDX = currentRecord.local_parent_index__numItems >> 16;
+ flattened_node.binary_hierarchy_index = 0xFFFFFFFF;
+ sub_group_barrier(0);
+ child_node_index = (child_node_index >> 16) + (child_node_index & mask);
+ flattened_node.childOffset_type = ((child_node_index - currQnodeLocalId) << 6) | BVH_INTERNAL_NODE;
+ }
+
+#if MORTON_VERBOSE_LOG
+ printf("wg %d: SUBGROUP_create_boxless_node_phase1: writeIDX %d, child_node_index %d - %d\n", get_group_id(0), writeIDX, child_node_index, child_node_index + numChildren);
+#endif
+ flattened_node.backPointer = (parentIDX << 6) | (numChildren << 3) | numFatleafChildren;
+ sg_children = MortonFlattenedBoxlessNode_reinterpret_as_BR(flattened_node);
+ }
+
+ child_node_index = sub_group_broadcast(child_node_index, numChildren);
+
+ if (lane != numChildren)
+ {
+ writeIDX = child_node_index + lane;
+ sg_children.local_parent_index__numItems |= currQnodeLocalId << 16;
+ }
+
+ nodeData[writeIDX].buildRecord = sg_children;
+ }
+
+ if (numFatleafChildren == numChildren) {
+ uint arridx = *sg_bu_startpoint_cnt;
+ // GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
+ set_2xSG_arr_first_write(arridx, sg_bu_startpoint_arr, (ushort)currQnodeLocalId, lane);
+ *sg_bu_startpoint_cnt = arridx + 1;
+ }
+}
+
+// TODO_OPT: Consider having phase 0 bucket the build records by number of primitives, and dispatch different variants
+// of this kernel with different WG sizes. There are many records produced that generate only 1 or 2 subtrees, so 8 SGs is
+// probably often wasted
+GRL_INLINE void phase1_process_fatleaf(
+ uint globalBaseForInternalNodes, // for root node this is indexOfRoot
+ uint globalParent , // for root this should be parentOfRoot
+ bool isInstancePrimLeafType, //
+ uint leafPrimType, //
+ uint leafStride, //
+ global struct QBVHNodeN* nodeData, // per group
+ uint nodeDataStart, //
+ struct AABB* primref, //
+ BackPointers* backPointers, //
+ global struct MortonCodePrimitive* mc,//
+ uint nodesToLeafsGap, //
+ local union UPerNodeData* perNodeData,//
+ bool processRoot, //
+ short localNodeId, //
+ BuildRecordLocalMortonFlattener fatleafRecord, // per node
+ uint primID ) //
+{
+ uint lane = get_sub_group_local_id();
+ uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
+ uniform uint mcID = fatleafRecord.rangeStart;
+ uint pseudolane = lane < numChildren ? lane : 0;
+ varying struct AABB sg_bounds = primref[primID];
+
+ uint local_parent_idx = (fatleafRecord.local_parent_index__numItems >> 16);
+ uint globalNodeId = globalBaseForInternalNodes + localNodeId;
+ uniform global struct QBVHNodeN* qnode = nodeData + globalNodeId;
+
+ uint children_offset = (mcID * leafStride + nodesToLeafsGap) - globalNodeId;
+
+ {
+ /* For all primitives in a fat leaf we store a back
+ * pointer. This way we can modify the fat leaf node at leaf construction time. */
+ uint back_pointer = globalNodeId + nodeDataStart;
+ /* Store back pointer and primID inside morton code array to
+ * be later used by leaf creation. */
+ mc[mcID + pseudolane].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
+ }
+
+ struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&sg_bounds);
+ reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
+
+ uint8_t instMask;
+ if (isInstancePrimLeafType)
+ {
+ instMask = lane < numChildren ? PRIMREF_instanceMask(&sg_bounds) : 0;
+ subgroup_setInstanceQBVHNodeN(children_offset, &sg_bounds, numChildren, qnode, instMask);
+ instMask = sub_group_reduce_or_N6(instMask);
+ }
+ else
+ {
+ instMask = 0xFF;
+ subgroup_setQBVHNodeN_setFields_reduced_bounds(children_offset, leafPrimType, &sg_bounds, numChildren, instMask, qnode, false, reduce_bounds);
+ }
+
+ reduce_bounds.lower.w = as_float((uint)instMask);
+ uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduce_bounds, 0);
+ local uint* boxUint = (local uint*)(perNodeData + localNodeId);
+ if (get_sub_group_size() == 8 || lane < 8)
+ {
+ boxUint[lane] = reduce_bounds_lane;
+ uint globalParentIdx;
+ if (processRoot) {
+ // for root, treeletRootGlobalIndex is index of rootsParent in global space
+ globalParentIdx = globalParent;
+ }
+ else {
+ // for non root, raw_parent_idx is in local space
+ globalParentIdx = (local_parent_idx > 0) ? (globalBaseForInternalNodes + local_parent_idx) : globalParent;
+ }
+ if (lane == 0) {
+ *InnerNode_GetBackPointer(backPointers, globalNodeId) = (globalParentIdx << 6) | (numChildren << 3);
+ }
+ }
+}
+
+GRL_INLINE void perform_phase1(global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem,
+ local union UPerNodeData* perNodeData,
+ local uint* local_records_head,
+ local uint* local_globalOffsetForNodes,
+ BuildRecordLocalMortonFlattener rootRecord,
+ uint treeletRootGlobalIndex,
+ uint parentOfRootIndex,
+ const uint leafPrimType,
+ bool isInstancePrimLeafType)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ varying ushort lane = get_sub_group_local_id();
+
+ // array that will keep 2x8 short indices
+ varying uint sg_fatleaf_array = 0x0;
+ uniform uint8_t sg_fatleaf_cnt = 0;
+ /* terminate when all subtrees are leaves */
+
+ uint subgroupId = get_sub_group_id();
+ uint ID = subgroupId;
+
+ uint sg_bu_startpoints = 0;
+ uniform uint sg_bu_startpoints_cnt = 0;
+ const uint shift_mask = globals->shift_mask;
+
+ const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData(bvh);
+
+ uint* pLeafStart = (!isInstancePrimLeafType) ? &bvh->quadLeafStart : &bvh->instanceLeafStart;
+ uint leafStart = *pLeafStart;
+ uint leafStride = (!isInstancePrimLeafType) ? 1 : (sizeof(struct HwInstanceLeaf) / sizeof(struct InternalNode));
+ uint nodesToLeafsGap = leafStart - nodeDataStart;
+
+ if (ID == 0)
+ {
+ BuildRecordLocalMortonFlattener current = rootRecord;
+
+ if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
+ {
+ *local_records_head = 1;
+#if MORTON_DEBUG_CHECKS
+ if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
+#endif
+ BuildRecordLocalMortonFlattener fatleafRecord = current;
+ uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
+ uint pseudolane = lane < numChildren ? lane : 0;
+ uniform const uint mcID = fatleafRecord.rangeStart;
+ varying uint primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
+
+ phase1_process_fatleaf(
+ treeletRootGlobalIndex, parentOfRootIndex, isInstancePrimLeafType, leafPrimType, leafStride,
+ nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
+ true, 0, fatleafRecord, primID);
+ }
+ else
+ {
+#if MORTON_VERBOSE_LOG
+ if (get_local_id(0) == 0) { printf("wg %d perform_phase1: starting collapsing subtree with root at node %d \n", get_group_id(0), treeletRootGlobalIndex); }
+#endif
+ //printf("local_records_head = %d\n", *local_records_head);
+ SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, 0, &sg_bu_startpoints, &sg_bu_startpoints_cnt, parentOfRootIndex, true, perNodeData);
+ *local_globalOffsetForNodes = treeletRootGlobalIndex;
+ }
+
+ ID += get_num_sub_groups();
+ }
+
+ uniform uint priv_records_tail = 1;
+
+ /* wait for all work items to have updated local_records array */
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ uniform uint priv_records_head = *local_records_head & 0xFFFF;
+ treeletRootGlobalIndex = *local_globalOffsetForNodes; // propagated from subgroup 1
+ uniform uint priv_records_tail_prev = priv_records_tail;
+ uniform uint other_records_head = priv_records_head;
+
+ uint ticToc = 1;
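+ // local_records_head holds two 16-bit counters (one per half); passes alternate via ticToc
+ // which half the producer side updates, and the loop below combines them to track how far
+ // the perNodeData record array has grown.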
+
+ if (priv_records_head == priv_records_tail)
+ {
+ return;
+ }
+ else
+ {
+ do
+ {
+ for (; ID < priv_records_head; ID += get_num_sub_groups())
+ {
+ BuildRecordLocalMortonFlattener current = (perNodeData[ID].buildRecord);
+
+ if ((current.local_parent_index__numItems & 0xFFFF) <= BVH_NODE_N6)
+ {
+ set_2xSG_arr_first_write(sg_fatleaf_cnt++, &sg_fatleaf_array, ID, lane);
+#if MORTON_VERBOSE_LOG
+ if (lane == 0)printf("wg %d, sg %d, perform_phase1: node ID %d is fatleaf \n", get_group_id(0), get_sub_group_id(), ID);
+#endif
+#if MORTON_DEBUG_CHECKS
+ if (sg_fatleaf_cnt > 32) printf("parallel_build_phase1_Indirect_SG sg_fatleaf_array: one subgroup has more than 32 items remembered\n");
+#endif
+ }
+ else
+ {
+ SUBGROUP_create_boxless_node_phase1(globals, bnodes, bvh_mem, current, ID, local_records_head, ticToc, &sg_bu_startpoints, &sg_bu_startpoints_cnt, 0, 0, perNodeData);
+ }
+ }
+
+ priv_records_tail = priv_records_head;
+ /* wait for all work items to have updated local_records array */
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+ {
+ uint records_as_in_mem = *local_records_head;
+ priv_records_head = (records_as_in_mem >> (16 * ticToc)) & 0xFFFF;
+ uint other_records_head_temp = priv_records_head;
+ priv_records_head += other_records_head;
+ other_records_head = other_records_head_temp;
+ ticToc = ticToc ^ 1;
+#if MORTON_VERBOSE_LOG
+ if(get_local_id(0) == 0)printf("wg %d, perform_phase1: priv_records_tail %d, priv_records_head %d, records_as_in_mem %x\n", get_group_id(0), get_sub_group_id(), priv_records_tail, priv_records_head, records_as_in_mem);
+#endif
+ }
+ } while (priv_records_tail != priv_records_head); // get out of the loop if the tail reached the head
+ }
+
+ bool atomicNodeAllocation = treeletRootGlobalIndex > 0;
+ bool atomicNodeAllocationProduce = (get_sub_group_id() + lane == 0) && atomicNodeAllocation;
+ uint singleTreeletBumpBVHnodeCnt = (!atomicNodeAllocation && (get_sub_group_id() + lane == 0)) ? nodeDataStart + priv_records_tail : 0;
+
+ uniform uint globalBaseForInternalNodes = 0;
+
+ // We distinguish multi-treelet from single-treelet builds here by looking at our treeletRootGlobalIndex:
+ // if the treelet's root is the whole tree root (treeletRootGlobalIndex == 0) then we are the only treelet,
+ // so there's no need to synchronize node allocations across multiple treelets with atomics.
+ if (atomicNodeAllocationProduce)
+ {
+ *local_globalOffsetForNodes = allocate_inner_nodes(bvh, priv_records_tail - 1);
+ }
+
+ // Because the root is allocated elsewhere, the first node placed in global mem is the node with local index 1.
+ // Mapping local to global:
+ //   local space                    global space
+ //   [0] - treelet root             [treeletRootGlobalIndex]
+ //   ... possibly very long distance ...
+ //   [1] - first non-root node      [globalBaseForInternalNodes + 1] - this index is returned by the atomic allocator above
+ //   [2] - second non-root node     [globalBaseForInternalNodes + 2]
+ //   ...
+ //   [numToAllocate] - last node    [globalBaseForInternalNodes + numToAllocate]
+ if (atomicNodeAllocation)
+ {
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+ globalBaseForInternalNodes = *local_globalOffsetForNodes -(nodeDataStart+1);
+ }
+
+#if MORTON_VERBOSE_LOG
+ if (get_local_id(0) == 0) { printf("wg %d perform_phase1: globalBaseForInternalNodes %d, num local nodes %d\n", get_group_id(0), globalBaseForInternalNodes, priv_records_tail - 1); }
+#endif
+
+ if (sg_fatleaf_cnt)
+ {
+ short localNodeId = get_from_2xSG_arr(sg_fatleaf_cnt - 1, sg_fatleaf_array, lane);
+ //if (localNodeId >= MORTON_BUILDER_SUBTREE_THRESHOLD * 2) continue;
+ //if(local_startpoints_cnt > 1) return;
+ BuildRecordLocalMortonFlattener fatleafRecord = perNodeData[localNodeId].buildRecord;
+
+ varying uint primID;
+ {
+ uint numChildren = (fatleafRecord.local_parent_index__numItems & 0xFFFF);
+ uint pseudolane = lane < numChildren ? lane : 0;
+ uniform const uint mcID = fatleafRecord.rangeStart;
+ primID = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
+ }
+
+ // process fatleafs, and store their boxes to SLM
+ // also put startpoints for bottom up
+ //uint fatleaf_cnt = *local_startpoints_cnt;
+ while (sg_fatleaf_cnt-- > 1)
+ {
+ short nextLocalNodeId = get_from_2xSG_arr(sg_fatleaf_cnt-1, sg_fatleaf_array, lane);
+ BuildRecordLocalMortonFlattener nextfatleafRecord = perNodeData[nextLocalNodeId].buildRecord;
+ varying uint nextPrimId;
+
+ {
+ uint numChildren = (nextfatleafRecord.local_parent_index__numItems & 0xFFFF);
+ uint pseudolane = lane < numChildren ? lane : 0;
+ uniform const uint mcID = nextfatleafRecord.rangeStart;
+ nextPrimId = (uint)(mc[mcID + pseudolane].index_code & shift_mask);
+ }
+
+ phase1_process_fatleaf(
+ globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
+ nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
+ false, localNodeId, fatleafRecord, primID);
+
+ fatleafRecord = nextfatleafRecord;
+ localNodeId = nextLocalNodeId;
+ primID = nextPrimId;
+ }
+
+ phase1_process_fatleaf(
+ globalBaseForInternalNodes, treeletRootGlobalIndex, isInstancePrimLeafType, leafPrimType, leafStride,
+ nodeData, nodeDataStart, primref, backPointers, mc, nodesToLeafsGap, perNodeData,
+ false, localNodeId, fatleafRecord, primID);
+ }
+
+#if 0
+ // put collected bottom-up startpoints to a wg-shared array to later distribute the work evenly across the groups.
+ {
+ ushort myStartpointWriteSite = 0;
+
+ if (lane == 0)
+ {
+ myStartpointWriteSite = atomic_add_local((local uint*)local_startpoints_cnt, (ushort)sg_bu_startpoints_cnt);
+ }
+ myStartpointWriteSite = sub_group_broadcast(myStartpointWriteSite, 0);
+
+ unpack_from_2xSG_arr(sg_bu_startpoints_cnt, sg_bu_startpoints, lane, local_startpoints_arr + myStartpointWriteSite);
+ }
+#endif
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ // distribute bottom-up startpoints
+#if 0
+ {
+ short sp_count_to_divide = (*local_startpoints_cnt);
+
+ //calculate the chunk for each sg.
+ sg_bu_startpoints_cnt = sp_count_to_divide / get_num_sub_groups();
+ uint sg_bu_startpoints_cnt_reminder = sp_count_to_divide % get_num_sub_groups();
+
+ uint myReadSite = get_sub_group_id() * sg_bu_startpoints_cnt;
+ if (get_sub_group_id() < sg_bu_startpoints_cnt_reminder) {
+ // from the remainder elements: if the sg idx is < sg_bu_startpoints_cnt_reminder then this sg gets one extra idx,
+ // and all sgs before it also got one extra
+ myReadSite += get_sub_group_id();
+ sg_bu_startpoints_cnt++;
+ }
+ else
+ {
+ // all remainder elements are consumed by previous sgs
+ myReadSite += sg_bu_startpoints_cnt_reminder;
+ }
+
+ pack_from_2xSG_arr(local_startpoints_arr + myReadSite, sg_bu_startpoints_cnt, &sg_bu_startpoints, lane);
+ }
+#endif
+
+ SUBGROUP_refit_bottom_up_local(nodeData, backPointers, treeletRootGlobalIndex, globalBaseForInternalNodes, lane, perNodeData, sg_bu_startpoints, sg_bu_startpoints_cnt);
+
+ if (singleTreeletBumpBVHnodeCnt)
+ {
+ bvh->nodeDataCur = singleTreeletBumpBVHnodeCnt;
+ }
+}
+
+GRL_INLINE void update_empty_blas(global struct BVHBase* bvh, uint leafPrimType)
+{
+ if (get_sub_group_id() == 0 )
+ {
+ global struct QBVHNodeN* qnode = BVHBase_nodeData(bvh);
+ BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
+
+ //set required fields to mark that blas is empty
+ uint k = (get_sub_group_local_id() < BVH_NODE_N6) ? get_sub_group_local_id() : 0;
+ qnode->type = leafPrimType;
+ qnode->instMask = 0;
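+ // lower_x > upper_x yields an invalid (empty) quantized child box, which is how the empty blas is marked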
+ qnode->qbounds.lower_x[k] = 0x80;
+ qnode->qbounds.upper_x[k] = 0;
+
+ *InnerNode_GetBackPointer(backPointers, 0) = (((uint)-1) << 6);
+ }
+}
+
+/*
+
+ POSTSORT PHASE1:
+ Two kernels here, selected by MORTON_BUILDER_SUBTREE_THRESHOLD.
+ 1. parallel_build_phase1_Indirect_SG - record[0] is set to the subtree tip
+ 2. parallel_build_phase1_Indirect_global_root - record[0] is set to the bvh root (no phase2 needed afterwards)
+
+*/
+
+__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_phase1_Indirect_SG( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ const uint leafPrimType = globals->leafPrimType;
+
+ //special case for empty blas
+ if(globals->numPrimitives == 0)
+ {
+ bvh->nodeDataCur = BVH_ROOT_NODE_OFFSET / 64 + 1;
+ update_empty_blas(bvh, leafPrimType);
+ return;
+ }
+
+ local union UPerNodeData perNodeData[(MORTON_BUILDER_SUBTREE_THRESHOLD * 2) -1];
+ local uint local_records_head;
+ // Two separate SLM variables for local_globalOffsetForNodes to remove one of the barriers
+ local uint local_globalOffsetForNodes, local_globalOffsetForNodes2;
+
+ uint rootIndex = 0;
+ uint parentOfRoot = 0;
+ BuildRecordLocalMortonFlattener rootBuildRecord;
+
+ /* add start build record to local stack */
+ if (get_sub_group_id() == 0 )
+ {
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64 * bvh->quadLeafStart);
+ uint recordID = get_group_id(0);
+ struct BuildRecordMorton mortonGlobalRecord = records[recordID];
+
+ rootBuildRecord = TranslateToLocalRecord(BinaryMortonCodeHierarchy_getEntry(bnodes, mortonGlobalRecord.nodeID));
+
+ parentOfRoot = mortonGlobalRecord.parent_index;
+ rootIndex = mortonGlobalRecord.current_index;
+
+#if MORTON_VERBOSE_LOG
+ printf("P1_STARTPOINTS: current_index: %d, buildRecord.numItems: %d, buildRecord.binary_hierarchy_index: %d, buildRecord.local_parent_index: %d\n",
+ local_globalOffsetForNodes, buildRecord.numItems, buildRecord.binary_hierarchy_index, buildRecord.local_parent_index);
+#endif
+ }
+
+ if (leafPrimType == NODE_TYPE_INSTANCE)
+ {
+ perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
+ &local_records_head, &local_globalOffsetForNodes,
+ rootBuildRecord, rootIndex, parentOfRoot, NODE_TYPE_INSTANCE, true);
+ }
+ else
+ {
+ perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
+ &local_records_head, &local_globalOffsetForNodes,
+ rootBuildRecord, rootIndex, parentOfRoot, leafPrimType, false);
+ }
+
+}
+
+__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_phase1_Indirect_global_root( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ const uint leafPrimType = globals->leafPrimType;
+ const uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;
+
+ bvh->nodeDataCur = nodeDataStart + 1;
+
+ //special case for empty blas
+ if(globals->numPrimitives == 0)
+ {
+ update_empty_blas(bvh, leafPrimType);
+ return;
+ }
+
+ local union UPerNodeData perNodeData[MORTON_BUILDER_SUBTREE_THRESHOLD * 2 - 1];
+ local uint local_records_head;
+ local uint local_globalOffsetForNodes;
+
+ BuildRecordLocalMortonFlattener rootBuildRecord;
+
+ if (get_sub_group_id() == 0 )
+ {
+ struct BinaryMortonCodeHierarchy binaryNode = BinaryMortonCodeHierarchy_getEntry(bnodes, globals->binary_hierarchy_root);
+
+ rootBuildRecord = TranslateToLocalRecord(binaryNode);
+
+ local_globalOffsetForNodes = 0;
+ }
+
+ if (leafPrimType == NODE_TYPE_INSTANCE)
+ {
+ perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
+ &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, NODE_TYPE_INSTANCE, true);
+ }
+ else
+ {
+ perform_phase1(globals, mc, primref, bnodes, bvh_mem, perNodeData,
+ &local_records_head, &local_globalOffsetForNodes, rootBuildRecord, 0, (uint)-1, leafPrimType, false);
+
+ }
+}
+
+#if 0
+GRL_INLINE void
+DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem,
+ uint startID, uint endID,
+ local uint* local_numRecords,
+ local uint* local_numRecordsOld,
+ local struct BuildRecordMorton* local_records
+)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);
+
+ /* iterate over all subtrees this workgroup should build */
+ for ( uint recordID = startID; recordID < endID; recordID++ )
+ {
+ /* add start build record to local stack */
+ if ( get_local_id( 0 ) == 0 )
+ {
+ local_records[0] = records[recordID];
+ *local_numRecords = 1;
+ *local_numRecordsOld = 0;
+ }
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ /* terminate when all subtrees are leaves */
+ while ( *local_numRecords != *local_numRecordsOld )
+ {
+ /* remember the old number of build records to detect later
+ * whether we are done */
+ if ( get_local_id( 0 ) == 0 )
+ {
+ *local_numRecordsOld = *local_numRecords;
+ }
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ /* all work items in the sub group pick a subtree to build */
+ for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
+ {
+ /* ignore small subtrees */
+ if ( local_records[ID].items <= BVH_NODE_N6 )
+ continue;
+
+ /* create QBVH node */
+ create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
+ }
+
+ /* wait for all work items to have updated local_records array */
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+ }
+
+ const uint shift_mask = globals->shift_mask;
+ const uint leafPrimType = globals->leafPrimType;
+ const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+
+ /* create all fat leaf nodes and initiate refit */
+ for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
+ {
+ struct BuildRecordMorton current = local_records[ID];
+ const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );
+
+ global struct QBVHNodeN* qnode = nodeData + current.current_index;
+
+ /* get bounds of all children of the fat leaf node */
+ struct AABB bounds[BVH_NODE_N6];
+ for ( uint i = 0; i < current.items; i++ )
+ {
+ /* get primID and bounds of primitive */
+ const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
+ bounds[i] = primref[primID];
+
+ /* For all primitives in a fat leaf we store a back
+ * pointer. This way we can modify the fat leaf node at leaf construction time. */
+ const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;
+
+ /* Store back pointer and primID inside morton code array to
+ * be later used by leaf creation. */
+ mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
+ }
+
+ /* update fat leaf node */
+ QBVHNodeN_setType( qnode, leafPrimType );
+ global void* offset;
+ if ( leafPrimType != BVH_INSTANCE_NODE )
+ {
+ offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
+ QBVHNodeN_setChildIncr1( qnode );
+ }
+ else
+ {
+ offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
+ QBVHNodeN_setChildIncr2( qnode );
+ }
+ QBVH6Node_set_offset( qnode, offset );
+ QBVHNodeN_setBounds( qnode, bounds, current.items );
+
+ /* set back pointers for fat leaf nodes */
+ *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);
+
+ /* bottom up refit */
+ refit_bottom_up( qnode, bvh, bounds, current.items );
+ }
+ }
+}
+
+/*
+
+ This phase takes the build records calculated in phase0 as input and
+ finishes the BVH construction for all these subtrees.
+
+*/
+__attribute__((reqd_work_group_size(8, 1, 1)))
+old_parallel_build_phase1(global struct Globals *globals,
+ global struct MortonCodePrimitive *mc,
+ global struct AABB *primref,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
+
+ /* a queue of build records */
+ local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
+ local uint local_numRecords;
+ local uint local_numRecordsOld;
+
+ /* construct range of build records that each sub group will process */
+ const uint numRecords = globals->numBuildRecords;
+ const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);
+
+ DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
+
+}
+
+__attribute__( (reqd_work_group_size( 8, 1, 1 )) )
+old_parallel_build_phase1_Indirect( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem )
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);
+
+ /* a queue of build records */
+ local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
+ local uint local_numRecords;
+ local uint local_numRecordsOld;
+
+ /* construct range of build records that each sub group will process */
+ const uint numRecords = globals->numBuildRecords;
+ uint startID = get_group_id( 0 );
+ uint endID = startID + 1;
+
+ DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
+
+}
+#endif
diff --git a/src/intel/vulkan/grl/gpu/morton/phase2.cl b/src/intel/vulkan/grl/gpu/morton/phase2.cl
new file mode 100644
index 00000000000..e82d22aaacf
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/phase2.cl
@@ -0,0 +1,314 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "bvh_build_refit.h"
+#include "libs/lsc_intrinsics.h"
+#include "morton/morton_common.h"
+
+/*
+
+ POSTSORT PHASE2:
+ Two kernels here, selected by MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD, which is set to a very large value.
+ 1. parallel_build_phase2_refit - performs refit using global synchronization and mem_fence_gpu_invalidate.
+ This kernel should be used only for a very big bvh; it is faster than the non-SLM fallback
+ in parallel_build_phase2_refit_local.
+ 2. parallel_build_phase2_refit_local - should be used for most cases; the nodes allocated in phase0 usually
+ fit into SLM, but there is also a non-SLM fallback there, as the decision on
+ which kernel to run is based on node-count estimates made on the host side.
+
+*/
+
+
+GRL_INLINE void refit_bottom_up_global_sync(
+ global char* bvh_mem,
+ global uint* global_refit_startpoints,
+ uniform uint nodeId,
+ uniform ushort lane)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+
+ BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+
+ // Get the node idx that was put here in phase1
+ const uint innerNodeIdx = global_refit_startpoints[nodeId];
+
+ // Get the qnode and backpointer
+ uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx;
+ uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
+
+ varying struct AABB childrenAABB; // one child AABB per lane
+ AABB_init(&childrenAABB);
+
+ uniform uint numChildren = (backPointer >> 3) & 0x7;
+ if(numChildren == 0) return;
+
+ global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
+ varying ushort child_idx = (lane < numChildren) ? lane : 0;
+ childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+#if MORTON_VERBOSE_LOG
+ if(lane == 0)
+ printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx);
+#endif
+
+ struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
+ reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );
+
+ subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane);
+
+ uint children_mask = qnode_child[child_idx].instMask;
+ qnode->instMask = sub_group_reduce_or_N6(children_mask);
+
+ SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 );
+}
+
+__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel
+parallel_build_phase2_refit( global char* bvh_mem,
+ global uint* global_refit_startpoints )
+{
+ refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0));
+}
+
+
+GRL_INLINE void SUBGROUP_refit_bottom_up_global(
+ uniform global struct QBVHNodeN* globalNodeData,
+ uniform struct BackPointers* backPointers,
+ varying ushort lane,
+ varying uint curNodeIndex)
+{
+ uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
+
+ const uint head_lane = 0;
+ uniform struct AABB child_aabb; // this carries reduced aabb between loop turns
+
+ while (curNodeIndex != 0)
+ {
+ global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex;
+ global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ varying ushort child_idx = (lane < numChildren) ? lane : 0;
+ child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+ struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
+ reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
+
+ /* get bounds of all children from child nodes directly */
+ subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane);
+
+ uchar childrenMask = qnode_child[child_idx].instMask;
+ qnode->instMask = sub_group_reduce_or_N6(childrenMask);
+
+ uint parentIndex = BackPointer_GetParentIndex(backpointer);
+
+ mem_fence_gpu_invalidate();
+
+ if (lane == 0)
+ {
+ backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex));
+
+ uint globalBackpointer = (parentIndex << 6) | (numChildren << 3);
+
+ /* set global back pointer */
+ *InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer;
+
+#if MORTON_VERBOSE_LOG
+ printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n",
+ curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x);
+#endif
+ }
+
+ backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
+ curNodeIndex = parentIndex;
+
+ /* if all children got refitted, then continue */
+ uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
+ uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
+
+ if (numChildrenRefitted != numChildrenTotal)
+ return;
+ }
+
+ // process root of the treelet
+ {
+
+#if MORTON_DEBUG_CHECKS
+ if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
+#endif
+
+ global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData );
+ uint numChildren = BackPointer_GetNumChildren(backpointer);
+
+ varying ushort child_idx = (lane < numChildren) ? lane : 0;
+ child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+ struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
+ reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);
+
+ /* get bounds of all children from child nodes directly */
+ subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane);
+
+ uchar childrenMask = qnode_child[child_idx].instMask;
+ globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask);
+
+ /* reset refit counter for next refit */
+ if (lane == 0)
+ {
+ /* set global back pointer */
+ *InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u);
+
+#if MORTON_VERBOSE_LOG
+ printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
+ curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
+#endif
+ }
+ }
+}
+
+
+// TODO: Check why 512 wg size has worse performance than 256
+__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+parallel_build_phase2_refit_local( global struct Globals* globals,
+ global char* bvh_mem,
+ global struct MortonFlattenedBoxlessNode *boxless_nodes)
+{
+ // Number of nodes created in P0, to be refitted in this stage
+ uint p0_created_num = globals->p0_created_num;
+
+ // Return immediately if host executed this kernel but there is nothing to do
+ if(p0_created_num == 0)
+ return;
+
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+ varying ushort lane = get_sub_group_local_id();
+
+ // Hardcode SLM to max here as we do not know upfront how much mem will be needed
+ local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */
+
+ // Number of allocated nodes in phase0 (p0_created_num + children)
+ uint p0_allocated_num = globals->p0_allocated_num;
+
+ // array that will keep 2x8 short indices
+ varying uint sg_fatleaf_array = 0x0;
+ uniform uint8_t sg_bu_startpoints_cnt = 0;
+
+ // Determine if we can fit into SLM with all the nodes allocated in phase0.
+ // There are two paths here:
+ // 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse the bottom-up local path,
+ // which does the refit and creates qnodes in the bvh.
+ // 2. If we do not fit into SLM, first create qnodes in the bvh, then perform the bottom-up refit with global atomic synchronization.
+ // This is not performant, so it is kept only as a guardrail here. On the host side we fall back
+ // to the old separated refit path with wg_size 8, which has better EU reuse.
+ if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM)
+ {
+ for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
+ {
+ MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
+ uint current_id = boxless_node.binary_hierarchy_index >> 6;
+
+ // The mask of children that are subtree roots is packed into the otherwise unused low 6 bits of binary_hierarchy_index
+ uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
+
+ if(lane == 0)
+ perNodeData[current_id].boxlessNode = boxless_node;
+
+ // When no children are subtree roots, we are done and skip to the next iteration
+ if(children_root_mask == 0x0)
+ {
+ continue;
+ }
+ // When all children are subtree roots, put them to sg_fatleaf_array
+ else if(children_root_mask == 0x3F)
+ {
+ set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
+ }
+
+ uniform global struct QBVHNodeN* qnode = nodeData + current_id;
+
+ uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
+ uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
+ varying ushort child_idx = (lane < numChildren) ? lane : 0;
+
+ varying struct AABB childrenAABB; // one child AABB per lane
+ AABB_init(&childrenAABB);
+
+ uint lead_child_global_id = current_id + lead_child_offset;
+
+ uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id;
+ childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );
+
+ // Get only AABBs of children that are p1 subtree roots
+ bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx);
+ if(lane_active)
+ {
+ uint child_global_id = lead_child_global_id + child_idx;
+ perNodeData[child_global_id].box = childrenAABB;
+ perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask);
+ }
+
+#if MORTON_VERBOSE_LOG
+ if(lane == 0)
+ printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset);
+#endif
+ }
+
+ work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+ SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt);
+ }
+ else
+ {
+ for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
+ {
+ MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
+ uint current_id = boxless_node.binary_hierarchy_index >> 6;
+
+ // The mask of children that are subtree roots is packed into the otherwise unused low 6 bits of binary_hierarchy_index
+ uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
+ uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
+
+ uniform global struct QBVHNodeN* qnode = nodeData + current_id;
+ uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node);
+ uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
+
+ SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
+ if(lane == 0)
+ {
+ QBVH6Node_set_type( qnode, nodeType );
+ qnode->offset = lead_child_offset;
+ }
+
+ // When no children are subtree roots, we are done and skip to the next iteration
+ if(children_root_mask == 0x0)
+ {
+ continue;
+ }
+ // When all children are subtree roots, put them to sg_fatleaf_array
+ else if(children_root_mask == 0x3F)
+ {
+ set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
+ }
+
+#if MORTON_VERBOSE_LOG
+ if(lane == 0)
+ printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, child_idx: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, boxless_node.backPointer >> 6, reduce_bounds.lower.x, qnode->offset);
+#endif
+ }
+
+ while (sg_bu_startpoints_cnt > 0)
+ {
+ uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane);
+
+ SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex);
+ }
+ }
+}
diff --git a/src/intel/vulkan/grl/gpu/morton/post_sort.cl b/src/intel/vulkan/grl/gpu/morton/post_sort.cl
new file mode 100644
index 00000000000..c13762438a3
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/post_sort.cl
@@ -0,0 +1,521 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "libs/lsc_intrinsics.h"
+#include "morton/morton_common.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+/*
+
+ This kernel constructs a binary hierarchy in bottom up fashion from
+ the morton codes.
+
+*/
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int Delta(global struct MortonCodePrimitive* mc, const uint64_t key0, const uint i1 )
+{
+ const uint64_t key1 = mc[i1].index_code;
+ return clz(key0 ^ key1);
+}
+
+int sign( int d )
+{
+ return (d > 0) ? 1 : -1;
+}
+
+__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) )
+void kernel build_bottom_up_indirect( global struct Globals* globals,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global struct MortonCodePrimitive* mc )
+{
+ /* construct range of primitives that each work group will process */
+ const uint numPrimitives = globals->numPrimitives;
+
+ uint i = get_group_id( 0 ) * get_local_size(0) + get_local_id( 0 );
+
+ if (i == 0)
+ {
+ globals->binary_hierarchy_root = 0;
+ if (numPrimitives == 1)
+ {
+ // special kludge for 1-prim tree. Make sure the one leaf node is initialized
+ bnodes[i].range.start = 0;
+ bnodes[i].range.end = 0;
+ bnodes[i].leftChild = -1;
+ bnodes[i].rightChild = -1;
+ }
+
+ // store pointer to the binary hierarchy in the globals struct.
+ // This will be used by the later build phases.
+ globals->binary_hierarchy_buffer = (gpuva_t) bnodes;
+ }
+
+ uint num_inner_nodes = numPrimitives-1;
+ if ( i < num_inner_nodes )
+ {
+ //
+ // direction is 1 if this morton code is the node's first key, -1 if it's the last
+ // By construction every internal node is either the start or the end of a given key range
+ // direction should be towards the neighbor with the most bits in common
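+ //
+ // Example: with keys mc[i-1]=...0010, mc[i]=...0011, mc[i+1]=...0100, the key at i-1 shares
+ // more leading bits with mc[i] than the key at i+1 does, so direction = -1 and the node's
+ // range grows towards lower indices.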
+
+ uint64_t ki = mc[i].index_code;
+
+ int direction, delta_min;
+ uint lmax;
+ if( i == 0 )
+ {
+ direction = 1;
+ delta_min = -1;
+ lmax = numPrimitives;
+ }
+ else
+ {
+ direction = sign( Delta( mc, ki, i + 1 ) - Delta( mc, ki, i - 1 ) );
+ delta_min = Delta( mc, ki, i - direction );
+
+ // find upper bound for length of this node's key range
+ lmax = 8;
+ while ( (i+lmax*direction) < numPrimitives && Delta( mc, ki, i+lmax*direction ) > delta_min)
+ lmax = lmax * 2;
+ }
+
+ // clamp max length so that the binary searches are fully in-bounds
+ uint maxLen = (direction>0) ? (numPrimitives - i) : (i+1);
+ lmax = min(lmax, maxLen);
+
+ // find end of range using binary search
+ uint length = 0;
+ uint end = lmax-1;
+ while (length != end)
+ {
+ uint mid = length + ((end-length)/2) + ((end-length)%2);
+ bool bigger = Delta( mc, ki, i+mid*direction) > delta_min;
+ length = bigger ? mid : length;
+ end = bigger ? end : mid-1;
+ }
+ uint j = i + length*direction ;
+
+ // find split position using binary search
+ uint split = 0;
+ end = length-1;
+ int delta_node = Delta(mc, ki, j);
+ while (split != end)
+ {
+ uint mid = split + ((end-split)/2) + ((end-split)%2);
+ bool bigger = Delta( mc, ki, i+mid*direction) > delta_node;
+ split = bigger ? mid : split;
+ end = bigger ? end : mid-1;
+ }
+ split = i + split*direction + min(direction,0);
+
+ uint left = split;
+ uint right = split+1;
+
+ // mark leaves
+ if( min(i,j) == split )
+ left = left | (1<<31);
+ if( max(i,j) == split+1 )
+ right = right | (1<<31);
+
+ bnodes[i].range.start = min(i,j);
+ bnodes[i].range.end = max(i,j);
+ bnodes[i].leftChild = left;
+ bnodes[i].rightChild = right;
+ }
+}
+
+
+
+
+
+#if 0
+__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( MAX_HW_SIMD_WIDTH )) )
+void kernel build_bottom_up_indirect( global struct Globals* globals,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global struct MortonCodePrimitive* mc )
+{
+ /* construct range of primitives that each work group will process */
+ const uint numPrimitives = globals->numPrimitives;
+
+ // RangeFactor determines the distance between adjacent nodeIds in work group.
+ // The aim of the nodes distribution within work group, for rangeFactor > 1
+ // is to be sure that half of the work groups will entirelly be dropped off
+ // at the bottom layer of the graph. This way the EUs can be reused faster.
+ // The factor needs to be smaller than MAX_HW_SIMD_WIDTH
+ const uint rangeFactor = 2;
+
+ const uint numGroups = ((numPrimitives + MAX_HW_SIMD_WIDTH - 1) / MAX_HW_SIMD_WIDTH);
+ const uint globalId = get_group_id( 0 ) * MAX_HW_SIMD_WIDTH + get_local_id( 0 );
+ const uint numPrimitivesAlignedToWGSize = MAX_HW_SIMD_WIDTH * numGroups;
+ const uint groupsRange = numPrimitivesAlignedToWGSize / rangeFactor;
+
+ /* iterate over all primitives the work group should process */
+ const uint i = (globalId * rangeFactor) % numPrimitivesAlignedToWGSize + globalId / groupsRange;
+
+ if ( i < numPrimitives )
+ {
+ uint node = i | ((uint)1 << 31);
+ uint start = i;
+ uint end = i;
+
+ /* bottom up */
+ while ( true )
+ {
+ /* goto parent node and link parent node to current node */
+ node = updateParent( bnodes, mc, node, start, end, numPrimitives - 1 );
+
+ /* do not continue if we reached this node the first time */
+ if ( node == -1 )
+ break;
+
+ mem_fence_gpu_invalidate();
+
+ /* update range */
+ start = bnodes[node].range.start;
+ end = bnodes[node].range.end;
+
+ /* stop when we reached the root node */
+ if ( start == 0 && end == numPrimitives - 1 )
+ {
+ globals->binary_hierarchy_root = node;
+ break;
+ }
+ }
+ }
+}
+
+#endif
+
+/*
+
+ This function builds one QBVH6 node by opening the provided binary
+ BVH nodes until the QBVH node is full.
+
+ */
+
+GRL_INLINE void create_node(global struct Globals *globals,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem,
+ uint rID,
+ local uint *local_numRecords,
+ local uint *local_QNodeOffset,
+ struct BuildRecordMorton *records,
+ struct BuildRecordMorton *current)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
+ BackPointers *backPointers = BVHBase_GetBackPointers(bvh);
+
+ /* initialize child array */
+ uint numChildren = 2;
+ struct BuildRecordMorton children[BVH_NODE_N6];
+ children[0].nodeID = bnodes[current->nodeID].leftChild;
+ children[0].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[0].nodeID);
+ children[1].nodeID = bnodes[current->nodeID].rightChild;
+ children[1].items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, children[1].nodeID);
+
+ /* fill QBVH6 node with up to 6 children */
+ while (numChildren < BVH_NODE_N6)
+ {
+ /*! find best child to split */
+ uint bestItems = 0;
+ int bestChild = -1;
+ for (int i = 0; i < numChildren; i++)
+ {
+ const uint items = children[i].items;
+
+ /* ignore leaves as they cannot get split */
+ if (items <= cfg_minLeafSize)
+ continue;
+
+ /* find child with largest number of items */
+ if (items > bestItems)
+ {
+ bestItems = items;
+ bestChild = i;
+ }
+ }
+ if (bestChild == -1)
+ break;
+
+ /* perform best found split */
+ const uint bestNodeID = children[bestChild].nodeID;
+ struct BuildRecordMorton *lrecord = &children[bestChild];
+ struct BuildRecordMorton *rrecord = &children[numChildren];
+ lrecord->nodeID = bnodes[bestNodeID].leftChild;
+ lrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, lrecord->nodeID);
+ rrecord->nodeID = bnodes[bestNodeID].rightChild;
+ rrecord->items = BinaryMortonCodeHierarchy_getNumPrimitives(bnodes, rrecord->nodeID);
+ numChildren++;
+ }
+
+ /* allocate memory for all children */
+ const uint child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);
+ global struct QBVHNodeN *childNodes = (global struct QBVHNodeN *)(bvh_mem + child_node_offset);
+
+ /* create node, but do not set bounds yet as these get calculated during refit */
+ const uint current_index = current->current_index;
+ struct QBVHNodeN *qnode = nodeData + current_index;
+ QBVH6Node_set_type(qnode, BVH_INTERNAL_NODE);
+ QBVHNodeN_setChildIncr1(qnode);
+ QBVH6Node_set_offset(qnode, childNodes);
+
+ /* set back pointers */
+ *InnerNode_GetBackPointer(backPointers, current_index) = (current->parent_index << 6) | (numChildren << 3);
+
+ /* update parent pointer of build records of all children */
+ for (uint ID = 0; ID < numChildren; ID++)
+ {
+ children[ID].current_index = childNodes - nodeData + ID;
+ children[ID].parent_index = current_index;
+ }
+
+ /* write out child build records */
+ const uint global_offset = atomic_add_local(local_numRecords, numChildren - 1);
+ records[rID] = children[0];
+
+ for (uint i = 1; i < numChildren; i++)
+ records[global_offset + i - 1] = children[i];
+
+ mem_fence_workgroup_default();
+
+}
+
+#if 0
+/* This function calculates the similarity between two morton
+ * codes. It essentially counts how many bits of the morton codes are
+ * equal starting at the top. The more bits are equal, the more similar the
+ * codes, and the closer the primitives are located spatially. */
+
+GRL_INLINE uint64_t delta(global struct MortonCodePrimitive *mc,
+ const uint id)
+{
+ const uint64_t key0 = mc[id + 0].index_code;
+ const uint64_t key1 = mc[id + 1].index_code;
+ return clz(key0 ^ key1);
+}
+
+
+
+/* This function checks for a range [left,right] of morton codes, if
+ * it is spatially closer to the left or to the right nodes. */
+
+GRL_INLINE bool merge_to_right(global struct MortonCodePrimitive *mc,
+ const uint left,
+ const uint right,
+ const uint last)
+{
+ /* merge to right if we are at the left end of the array */
+ if (left == 0)
+ return true;
+
+ /* merge to left if we are at the right end of the array */
+ if (right == last)
+ return false;
+
+ /* otherwise merge to the side where the morton code sequence has
+ * the largest number of equal bits from the top */
+ return delta(mc, right) > delta(mc, left - 1);
+}
+
+GRL_INLINE uint updateParent(global struct BinaryMortonCodeHierarchy *bnodes,
+ global struct MortonCodePrimitive *mc,
+ const uint nodeID,
+ const uint left,
+ const uint right,
+ const uint last)
+{
+ uint parent;
+
+ /* check if we should merge this node to the left or right */
+ if (merge_to_right(mc, left, right, last))
+ {
+ parent = right;
+ bnodes[parent].leftChild = nodeID;
+ bnodes[parent].range.start = left;
+ }
+ else
+ {
+ parent = left - 1;
+ bnodes[parent].rightChild = nodeID;
+ bnodes[parent].range.end = right;
+ }
+
+ mem_fence_gpu_default();
+
+ /* stop ascending the tree if we reached this node the first time */
+ const bool first = atomic_inc_global((global uint *)&bnodes[parent].flag) == 0;
+ return first ? -1 : parent;
+}
+
+GRL_INLINE void
+DO_OLD_PARALLEL_BUILD_PHASE1( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem,
+ uint startID, uint endID,
+ local uint* local_numRecords,
+ local uint* local_numRecordsOld,
+ local struct BuildRecordMorton* local_records
+)
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + bvh->quadLeafStart*64);
+
+ /* iterate over all subtrees this workgroup should build */
+ for ( uint recordID = startID; recordID < endID; recordID++ )
+ {
+ /* add start build record to local stack */
+ if ( get_local_id( 0 ) == 0 )
+ {
+ local_records[0] = records[recordID];
+ *local_numRecords = 1;
+ *local_numRecordsOld = 0;
+ }
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ /* terminate when all subtrees are leaves */
+ while ( *local_numRecords != *local_numRecordsOld )
+ {
+ /* remember the old number of build records to detect later
+ * whether we are done */
+ if ( get_local_id( 0 ) == 0 )
+ {
+ *local_numRecordsOld = *local_numRecords;
+ }
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+
+ /* all work items in the sub group pick a subtree to build */
+ for ( uint ID = get_local_id( 0 ); ID < *local_numRecordsOld; ID += get_local_size( 0 ) )
+ {
+ /* ignore small subtrees */
+ if ( local_records[ID].items <= BVH_NODE_N6 )
+ continue;
+
+ /* create QBVH node */
+ create_node( globals, bnodes, bvh_mem, ID, local_numRecords, local_records, &local_records[ID] );
+ }
+
+ /* wait for all work items to have updated local_records array */
+ work_group_barrier( CLK_LOCAL_MEM_FENCE );
+ }
+
+ const uint shift_mask = globals->shift_mask;
+ const uint leafPrimType = globals->leafPrimType;
+ const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
+ BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
+ global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
+
+ /* create all fat leaf nodes and initiate refit */
+ for ( uint ID = get_local_id( 0 ); ID < *local_numRecords; ID += get_local_size( 0 ) )
+ {
+ struct BuildRecordMorton current = local_records[ID];
+ const uint primrefID = BinaryMortonCodeHierarchy_getRangeStart( bnodes, current.nodeID );
+
+ global struct QBVHNodeN* qnode = nodeData + current.current_index;
+
+ /* get bounds of all children of the fat leaf node */
+ struct AABB bounds[BVH_NODE_N6];
+ for ( uint i = 0; i < current.items; i++ )
+ {
+ /* get primID and bounds of primitive */
+ const uint primID = (uint)(mc[primrefID + i].index_code & shift_mask);
+ bounds[i] = primref[primID];
+
+ /* For all primitives in a fat leaf we store a back
+ * pointer. This way we can modify the fat leaf node at leaf construction time. */
+ const uint back_pointer = qnode - (struct QBVHNodeN*)bvh_mem;
+
+ /* Store back pointer and primID inside morton code array to
+ * be later used by leaf creation. */
+ mc[primrefID + i].index_code = ((ulong)back_pointer) << 32 | (ulong)primID;
+ }
+
+ /* update fat leaf node */
+ QBVHNodeN_setType( qnode, leafPrimType );
+ global void* offset;
+ if ( leafPrimType != BVH_INSTANCE_NODE )
+ {
+ offset = bvh_mem + 64*bvh->quadLeafStart + primrefID * sizeof( struct Quad );
+ QBVHNodeN_setChildIncr1( qnode );
+ }
+ else
+ {
+ offset = bvh_mem + 64*bvh->instanceLeafStart + primrefID * sizeof( struct HwInstanceLeaf );
+ QBVHNodeN_setChildIncr2( qnode );
+ }
+ QBVH6Node_set_offset( qnode, offset );
+ QBVHNodeN_setBounds( qnode, bounds, current.items );
+
+ /* set back pointers for fat leaf nodes */
+ *InnerNode_GetBackPointer(backPointers, current.current_index) = (current.parent_index << 6) | (current.items << 3);
+
+ /* bottom up refit */
+ refit_bottom_up( qnode, bvh, bounds, current.items );
+ }
+ }
+}
+
+/*
+
+ This phase takes the build records calculated in phase0 as input and
+ finishes the BVH construction for all these subtrees.
+
+*/
+__attribute__((reqd_work_group_size(8, 1, 1)))
+old_parallel_build_phase1(global struct Globals *globals,
+ global struct MortonCodePrimitive *mc,
+ global struct AABB *primref,
+ global struct BinaryMortonCodeHierarchy *bnodes,
+ global char *bvh_mem)
+{
+ global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
+ global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
+
+ /* a queue of build records */
+ local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
+ local uint local_numRecords;
+ local uint local_numRecordsOld;
+
+ /* construct range of build records that each sub group will process */
+ const uint numRecords = globals->numBuildRecords;
+ const uint startID = (get_group_id(0) + 0) * numRecords / get_num_groups(0);
+ const uint endID = (get_group_id(0) + 1) * numRecords / get_num_groups(0);
+
+ DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
+
+}
+
+__attribute__( (reqd_work_group_size( 8, 1, 1 )) )
+old_parallel_build_phase1_Indirect( global struct Globals* globals,
+ global struct MortonCodePrimitive* mc,
+ global struct AABB* primref,
+ global struct BinaryMortonCodeHierarchy* bnodes,
+ global char* bvh_mem )
+{
+ global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
+ global struct BuildRecordMorton* records = (global struct BuildRecordMorton*)(bvh_mem + 64*bvh->quadLeafStart);
+
+ /* a queue of build records */
+ local struct BuildRecordMorton local_records[MORTON_BUILDER_SUBTREE_THRESHOLD];
+ local uint local_numRecords;
+ local uint local_numRecordsOld;
+
+ /* construct range of build records that each sub group will process */
+ const uint numRecords = globals->numBuildRecords;
+ uint startID = get_group_id( 0 );
+ uint endID = startID + 1;
+
+ DO_OLD_PARALLEL_BUILD_PHASE1( globals, mc, primref, bnodes, bvh_mem, startID, endID, &local_numRecords, &local_numRecordsOld, local_records );
+
+}
+#endif
diff --git a/src/intel/vulkan/grl/gpu/morton/pre_sort.cl b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl
new file mode 100644
index 00000000000..099f926e194
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton/pre_sort.cl
@@ -0,0 +1,117 @@
+//
+// Copyright (C) 2009-2022 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "morton/morton_common.h"
+
+GRL_INLINE uint get_morton_shift( uint numPrimitives )
+{
+ return 32 - clz( numPrimitives );
+}
+
+GRL_INLINE uint get_morton_shift_mask( uint numPrimitives )
+{
+ uint shift = get_morton_shift( numPrimitives );
+ uint mask =(uint)(((ulong)1 << shift));
+ return mask - 1; // separated due to problems in DX
+}
+
+__attribute__((reqd_work_group_size(1, 1, 1))) void kernel init( global struct Globals *globals )
+{
+ /* variable shift for putting morton code + index to 64 bit */
+ const uint shift = 32 - clz(globals->numPrimitives);
+ globals->shift = shift;
+ globals->shift_mask = (uint)(((ulong)1 << shift));
+ globals->shift_mask -= 1; // separated due to problems in DX
+ globals->binary_hierarchy_root = 0;
+ globals->morton_sort_in_flight = 0;
+ globals->sort_iterations = get_morton_sort_lsb_req_iterations(shift);
+}
+
+/*
+
+ This kernel creates a morton code array containing a morton code and
+ index into the primref array.
+
+ The code uses the maximal number of bits for the morton code, such
+ that the morton code and index can still both get stored in 64 bits.
+
+ The algorithm first maps the centroids of the primitives and their
+ bounding box diagonal into a 4D grid, and then interleaves all 4
+ grid coordinates to construct the morton code.
+
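+ For example, for numPrimitives = 1000: shift = 32 - clz(1000) = 10, so the packed 64-bit key
+ is (code << 10) | primID, shift_mask = (1 << 10) - 1 selects the index bits, and each of the
+ 4 grid axes gets (64 - 10) / 4 = 13 bits (grid_size = 1 << 13).
+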
+ */
+
+__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) )
+__attribute__( (intel_reqd_sub_group_size( 16 )) ) void kernel
+create_morton_codes_indirect( global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global struct AABB* primref,
+ global struct MortonCodePrimitive* morton_codes,
+ global struct MortonCodePrimitive* morton_codes_tmp,
+ uint use_new_morton_sort)
+{
+ /* construct range of morton codes each work group should create */
+ const uint numPrimitives = globals->numPrimitives;
+ const uint startID = get_group_id( 0 ) * get_local_size( 0 );
+ const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives);
+
+ /* get lower and upper bounds of geometry and length of scene diagonal */
+ const float3 lower = globals->centroidBounds.lower.xyz;
+ const float3 upper = globals->centroidBounds.upper.xyz;
+ const float diag = length( AABB3f_size( &bvh->Meta.bounds ).xyz );
+
+ /* calculates the 4D grid */
+ const uint shift = get_morton_shift( numPrimitives );
+ const uint grid_size = 1 << (64 - shift) / 4;
+ const float4 grid_base = (float4)(lower, 0.0f);
+ const float4 grid_extend = (float4)(upper - lower, diag);
+ const float4 grid_scale = select( (grid_size * 0.99f) / grid_extend, 0.0f, grid_extend == 0.0f ); // FIXME: 0.99f!!!!!
+
+ const uint req_iterations = get_morton_sort_lsb_req_iterations(shift);
+
+ /* each work group iterates over its range of morton codes to create */
+ uint primID = startID + get_local_id( 0 );
+ if( primID < endID )
+ {
+ /* calculate position inside 4D grid */
+ float4 centroid2 = AABB_centroid2( &primref[primID] );
+ centroid2.w = length( AABB_size( &primref[primID] ).xyz );
+ const uint4 gridpos = convert_uint4_rtz( (centroid2 - grid_base) * grid_scale );
+
+ /* calculate and store morton code */
+ const ulong code = ulong_bitInterleave4D( gridpos );
+ const ulong index_code = ((ulong)code << shift) | (ulong)primID;
+
+ // The morton codes are required to end up in the morton_codes buffer after the LSB sort finishes.
+ // The sort may skip some iterations, so the number of required iterations can be odd;
+ // for an odd iteration count, start with the morton_codes_tmp buffer so the ping-pong ends in morton_codes.
+ if(req_iterations & 1 && !use_new_morton_sort)
+ morton_codes_tmp[primID].index_code = index_code;
+ else
+ morton_codes[primID].index_code = index_code;
+ }
+}
+
+/*
+
+ Initialization of the binary morton code hierarchy.
+
+ */
+
+__attribute__( (reqd_work_group_size( MAX_HW_SIMD_WIDTH, 1, 1 )) ) void kernel init_bottom_up_indirect( global struct Globals* globals,
+ global struct BinaryMortonCodeHierarchy* bnodes )
+{
+ /* construct range each work group will process */
+ const uint numPrimitives = globals->numPrimitives;
+ const uint startID = get_group_id( 0 ) * get_local_size(0);
+ const uint endID = min((uint)(startID + get_local_size(0)), numPrimitives);
+
+ /* each workgroup iterates over its range to initialize the binary BVH */
+ uint i = startID + get_local_id( 0 );
+ if( i < endID )
+ BinaryMortonCodeHierarchy_init( &bnodes[i], 0, numPrimitives - 1 );
+}
diff --git a/src/intel/vulkan/grl/gpu/morton_builder.grl b/src/intel/vulkan/grl/gpu/morton_builder.grl
new file mode 100644
index 00000000000..f221fd39fed
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_builder.grl
@@ -0,0 +1,335 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module morton_builder;
+
+kernel_module morton_kernels ("morton/pre_sort.cl")
+{
+ kernel opencl_build_kernel_init < kernelFunction="init" >;
+ kernel opencl_build_morton_kernel_create_morton_codes_indirect < kernelFunction="create_morton_codes_indirect" >;
+ kernel opencl_build_morton_kernel_init_bottom_up_indirect < kernelFunction="init_bottom_up_indirect" >;
+}
+
+kernel_module morton_kernels ("morton/post_sort.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_morton_kernel_build_bottom_up_indirect < kernelFunction="build_bottom_up_indirect" >;
+}
+
+kernel_module morton_kernels ("morton/phase0.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_morton_kernel_parallel_build_phase0 < kernelFunction="parallel_build_phase0" >;
+ kernel opencl_build_morton_kernel_parallel_build_phase0_local_sync < kernelFunction="parallel_build_phase0_local_sync" >;
+}
+
+kernel_module morton_kernels ("morton/phase1.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_morton_kernel_parallel_build_phase1_Indirect < kernelFunction="parallel_build_phase1_Indirect_SG" >;
+ kernel opencl_build_morton_kernel_parallel_build_phase1_root < kernelFunction="parallel_build_phase1_Indirect_global_root" >;
+}
+
+kernel_module morton_kernels ("morton/phase2.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_morton_kernel_parallel_build_phase2_refit < kernelFunction="parallel_build_phase2_refit" >;
+ kernel opencl_build_morton_kernel_parallel_build_phase2_refit_local < kernelFunction="parallel_build_phase2_refit_local" >;
+}
+
+import struct MKBuilderState "structs.grl";
+
+/*
+metakernel begin(
+ MKBuilderState state,
+ qword morton_code_buffer,
+ dword primLeafType,
+ dword numHwThreads)
+{
+ dispatch opencl_build_kernel_init(1, 1, 1) args(
+ state.build_globals
+ );
+
+ control(wait_idle);
+
+
+ dispatch opencl_build_morton_kernel_create_morton_codes(numHwThreads, 1, 1) args(
+ state.build_globals,
+ state.bvh_buffer,
+ state.build_primref_buffer,
+ morton_code_buffer);
+
+ control(wait_idle);
+
+}
+
+metakernel build_bottom_up(
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer,
+ dword numHwThreads)
+{
+ dispatch opencl_build_morton_kernel_init_bottom_up(numHwThreads, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_build_bottom_up(numHwThreads, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ morton_code_buffer);
+
+ control(wait_idle);
+
+}
+
+
+metakernel parallel_build(
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer,
+ dword numHwThreads)
+{
+ dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase1(numHwThreads, 1, 1) args(
+ state.build_globals,
+ morton_code_buffer,
+ state.build_primref_buffer,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+
+ control(wait_idle);
+
+}
+
+*/
+
+metakernel NewMorton_pre_sort(
+ qword num_primrefs_counter,
+ MKBuilderState state,
+ qword morton_code_buffer,
+ qword morton_code_buffer_tmp,
+ qword buildrecords_bottom_up,
+ dword use_new_morton_sort)
+{
+
+
+ {
+ REG1 = 15;
+ REG2 = 4;
+ REG0 = load_dword( num_primrefs_counter );
+
+ REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
+ REG1 = ~REG1;
+ REG0 = REG0 & REG1;
+ REG0 = REG0 >> REG2;
+ }
+
+ dispatch opencl_build_kernel_init(1, 1, 1) args( state.build_globals );
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ /*
+ // new bottom-up kernel does not need this
+ dispatch_indirect opencl_build_morton_kernel_init_bottom_up_indirect args(
+ state.build_globals,
+ buildrecords_bottom_up);
+ */
+ dispatch_indirect opencl_build_morton_kernel_create_morton_codes_indirect args(
+ state.build_globals,
+ state.bvh_buffer,
+ state.build_primref_buffer,
+ morton_code_buffer,
+ morton_code_buffer_tmp,
+ use_new_morton_sort);
+
+
+}
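For illustration, the register block in the metakernel above rounds the primitive-reference count up to a multiple of 16 and shifts right by 4, i.e. it computes ceil(count / 16) for the indirect dispatch width; the divisor of 16 presumably matches the kernel's workgroup width. A minimal standalone C sketch of the same arithmetic (helper name and sample count are illustrative only, not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t dispatch_groups(uint32_t num_primrefs)
    {
        /* REG0 = ((num_primrefs + 15) & ~15) >> 4  ==  ceil(num_primrefs / 16) */
        return ((num_primrefs + 15u) & ~15u) >> 4;
    }

    int main(void)
    {
        printf("%u\n", dispatch_groups(1000));  /* prints 63: 1000 primrefs -> 63 groups of 16 */
        return 0;
    }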
+
+
+
+metakernel NewMorton_post_sort(
+ qword num_primrefs_counter,
+ qword num_buildrecords_counter,
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer )
+{
+
+ {
+ REG1 = 15;
+ REG2 = 4;
+ REG0 = load_dword( num_primrefs_counter );
+
+ REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
+ REG1 = ~REG1;
+ REG0 = REG0 & REG1;
+ REG0 = REG0 >> REG2;
+ }
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ morton_code_buffer);
+
+
+ /*
+ dispatch opencl_build_morton_kernel_build_bottom_up(16, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ morton_code_buffer);
+ */
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+
+ control(wait_idle);
+
+ DISPATCHDIM_X = load_dword( num_buildrecords_counter );
+
+ dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args(
+ state.build_globals,
+ morton_code_buffer,
+ state.build_primref_buffer,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+
+ control(wait_idle);
+
+}
+
+metakernel NewMorton_bottom_up(
+ qword num_primrefs_counter,
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer )
+{
+
+ {
+ REG1 = 15;
+ REG2 = 4;
+ REG0 = load_dword( num_primrefs_counter );
+
+ REG0 = REG0 + REG1; // JDB TODO: TGL will need to do this computation in the EU and store it in globals
+ REG1 = ~REG1;
+ REG0 = REG0 & REG1;
+ REG0 = REG0 >> REG2;
+ }
+
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_build_morton_kernel_build_bottom_up_indirect args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ morton_code_buffer);
+}
+
+
+metakernel NewMorton_phase0(
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_p0_refit_startpoints)
+{
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase0(1, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ state.bvh_buffer,
+ morton_p0_refit_startpoints);
+}
+
+metakernel NewMorton_phase0_local_sync(
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword p0_boxless_nodes)
+{
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase0_local_sync(1, 1, 1) args(
+ state.build_globals,
+ buildrecords_bottom_up,
+ state.bvh_buffer,
+ p0_boxless_nodes);
+}
+
+
+metakernel NewMorton_phase1(
+ qword num_buildrecords_counter,
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer)
+{
+
+ DISPATCHDIM_X = load_dword( num_buildrecords_counter );
+
+ dispatch_indirect opencl_build_morton_kernel_parallel_build_phase1_Indirect args(
+ state.build_globals,
+ morton_code_buffer,
+ state.build_primref_buffer,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+}
+
+metakernel NewMorton_phase1_root(
+ qword num_buildrecords_counter,
+ MKBuilderState state,
+ qword buildrecords_bottom_up,
+ qword morton_code_buffer)
+{
+ dispatch opencl_build_morton_kernel_parallel_build_phase1_root(1, 1, 1) args(
+ state.build_globals,
+ morton_code_buffer,
+ state.build_primref_buffer,
+ buildrecords_bottom_up,
+ state.bvh_buffer);
+}
+
+metakernel NewMorton_phase2(
+ qword num_leaves_counter,
+ MKBuilderState state,
+ qword bottom_node_ids )
+{
+
+ DISPATCHDIM_X = load_dword( num_leaves_counter );
+
+ dispatch_indirect opencl_build_morton_kernel_parallel_build_phase2_refit args(
+ state.bvh_buffer,
+ bottom_node_ids);
+}
+
+metakernel NewMorton_phase2_local(
+ MKBuilderState state,
+ qword p0_boxless_nodes)
+{
+
+ dispatch opencl_build_morton_kernel_parallel_build_phase2_refit_local(1, 1, 1) args(
+ state.build_globals,
+ state.bvh_buffer,
+ p0_boxless_nodes);
+}
diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl
new file mode 100644
index 00000000000..075d44a51ba
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.cl
@@ -0,0 +1,9 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// just inlines the kernels defined in the header
+#include "morton_msb_radix_bitonic_sort.h" \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h
new file mode 100644
index 00000000000..4fb6c21b014
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort.h
@@ -0,0 +1,924 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "common.h"
+#include "morton_msb_radix_bitonic_sort_shared.h"
+
+#include "libs/lsc_intrinsics.h"
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Configuration switches
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#define DEBUG 0
+#define MERGE_BLS_WITHIN_SG 0
+
+///////////////////////////////////////////////////////////////////////////////
+
+
+#if DEBUG
+#define DEBUG_CODE(A) A
+#else
+#define DEBUG_CODE(A)
+#endif
+
+#define BOTTOM_LEVEL_SORT_WG_SIZE 512
+
+// this kernel is only dispatched from the metakernel for debugging, to print that execution reached this point
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel debug_print_kernel(uint variable)
+{
+ if(get_local_id(0) == 0)
+ printf("I'm here! %d\n", variable);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(1, 1, 1)))
+void kernel check_bls_sort(global struct Globals* globals, global ulong* input)
+{
+ uint prims_num = globals->numPrimitives;
+
+ printf("in check_bls_sort kernel. Values count:: %d\n", prims_num);
+
+ ulong left = input[0];
+ ulong right;
+ for (int i = 0; i < prims_num - 1; i++)
+ {
+ right = input[i + 1];
+ printf("sorted val: %llu\n", left);
+ if (left > right)
+ {
+ printf("element %d is bigger than %d: %llu > %llu\n", i, i+1, left, right);
+ }
+ left = right;
+ }
+}
+
+inline uint wg_scan_inclusive_add_opt(local uint* tmp, uint val, uint SG_SIZE, uint WG_SIZE)
+{
+ const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE;
+ const uint sg_local_id = get_local_id(0) % SG_SIZE;
+ const uint NUM_HW_THREADS_IN_WG = WG_SIZE / SG_SIZE;
+
+ uint acc = sub_group_scan_inclusive_add(val);
+ if (NUM_HW_THREADS_IN_WG == 1)
+ {
+ return acc;
+ }
+ tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc, SG_SIZE - 1);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint loaded_val = sg_local_id < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0;
+ uint wgs_acc = sub_group_scan_exclusive_add(loaded_val);
+ uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id);
+ // for > 256 workitems in SIMD16 the number of HW threads exceeds one subgroup (16 lanes), so additional iterations are needed;
+ // the same applies for > 64 workitems in SIMD8
+ uint num_iterations = (NUM_HW_THREADS_IN_WG + SG_SIZE - 1) / SG_SIZE;
+ for (int i = 1; i < num_iterations; i++)
+ {
+ // need to add tmp[] of the previous chunk's last thread, because the "exclusive" scan does not include it
+ uint prev_max_sum = sub_group_broadcast(wgs_acc, SG_SIZE - 1) + tmp[(i * SG_SIZE) - 1];
+ loaded_val = (sg_local_id + i * SG_SIZE) < NUM_HW_THREADS_IN_WG ? tmp[sg_local_id] : 0;
+ wgs_acc = sub_group_scan_exclusive_add(loaded_val);
+ wgs_acc += prev_max_sum;
+ uint new_acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id % SG_SIZE);
+ if (hw_thread_in_wg_id >= i * SG_SIZE)
+ acc_for_this_hw_thread = new_acc_for_this_hw_thread;
+ }
+ return acc + acc_for_this_hw_thread;
+}
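wg_scan_inclusive_add_opt combines sub-group scans with a pass over per-HW-thread partials in SLM to produce a workgroup-wide inclusive prefix sum. A scalar C reference (not part of the diff) of the intended result, assuming one value per work-item and modelling the workgroup as a plain array:

    #include <stdio.h>

    static void inclusive_scan_reference(const unsigned *val, unsigned *out, unsigned wg_size)
    {
        unsigned acc = 0;
        for (unsigned i = 0; i < wg_size; i++) {
            acc += val[i];        /* each work-item's result includes its own value */
            out[i] = acc;
        }
    }

    int main(void)
    {
        unsigned val[8] = {1, 0, 2, 3, 0, 1, 1, 4};
        unsigned out[8];
        inclusive_scan_reference(val, out, 8);
        for (unsigned i = 0; i < 8; i++)
            printf("%u ", out[i]);   /* 1 1 3 6 6 7 8 12 */
        printf("\n");
        return 0;
    }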
+
+struct MSBDispatchArgs
+{
+ global struct MSBRadixContext* context;
+ uint num_of_wgs; // this is the number of workgroups that was dispatched for this context
+ ulong* wg_key_start; // this is where keys to process start for current workgroup
+ ulong* wg_key_end;
+ uint shift_bit;
+};
+
+
+
+
+struct MSBDispatchArgs get_msb_dispatch_args(global struct VContextScheduler* scheduler)
+{
+ global struct MSBDispatchQueue* queue = &scheduler->msb_queue;
+
+ uint group = get_group_id(0);
+ struct MSBDispatchRecord record;
+
+ // TODO_OPT: Load this entire prefix array into SLM instead of searching..
+ // Or use sub-group ops
+ uint i = 0;
+ while (i < queue->num_records)
+ {
+ uint n = queue->records[i].wgs_to_dispatch;
+
+ if (group < n)
+ {
+ record = queue->records[i];
+ break;
+ }
+
+ group -= n;
+ i++;
+ }
+
+ uint context_id = i;
+ global struct MSBRadixContext* context = &scheduler->contexts[context_id];
+
+ // moving to ulongs to avoid uint overflow
+ ulong group_id_in_dispatch = group;
+ ulong start_offset = context->start_offset;
+ ulong num_keys = context->num_keys;
+ ulong wgs_to_dispatch = record.wgs_to_dispatch;
+
+ struct MSBDispatchArgs args;
+ args.context = context;
+ args.num_of_wgs = record.wgs_to_dispatch;
+ args.wg_key_start = context->keys_in + start_offset + (group_id_in_dispatch * num_keys / wgs_to_dispatch);
+ args.wg_key_end = context->keys_in + start_offset + ((group_id_in_dispatch+1) * num_keys / wgs_to_dispatch);
+ args.shift_bit = MSB_SHIFT_BYTE_START_OFFSET - context->iteration * MSB_BITS_PER_ITERATION;
+ return args;
+}
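The 64-bit arithmetic above splits a context's key range evenly across the workgroups dispatched for it while avoiding uint overflow for large key counts. A standalone C sketch of that split, with illustrative counts (not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t num_keys = 1000, wgs = 3;
        for (uint64_t g = 0; g < wgs; g++) {
            uint64_t begin = g * num_keys / wgs;          /* wg_key_start offset */
            uint64_t end   = (g + 1) * num_keys / wgs;    /* wg_key_end offset */
            printf("wg %llu: keys [%llu, %llu)\n",
                   (unsigned long long)g,
                   (unsigned long long)begin,
                   (unsigned long long)end);  /* [0,333) [333,666) [666,1000) */
        }
        return 0;
    }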
+
+
+
+
+void BLSDispatchQueue_push(global struct BLSDispatchQueue* queue, struct BLSDispatchRecord* record)
+{
+ uint new_idx = atomic_inc_global(&queue->num_records);
+ queue->records[new_idx] = *record;
+ DEBUG_CODE(printf("adding bls of size: %d\n", record->count));
+}
+
+
+
+
+void DO_CountSort(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output)
+{
+ uint tid = get_local_id(0);
+
+ global ulong* in = ((global ulong*)(dispatchRecord.keys_in)) + dispatchRecord.start_offset;
+
+ ulong a = tid < dispatchRecord.count ? in[tid] : ULONG_MAX;
+
+ SLM_shared[tid] = a;
+
+ uint counter = 0;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ ulong curr = SLM_shared[get_sub_group_local_id()];
+
+ for (uint i = 16; i < dispatchRecord.count; i += 16)
+ {
+ ulong next = SLM_shared[i + get_sub_group_local_id()];
+
+ for (uint j = 0; j < 16; j++)
+ {
+ // some older drivers have a bug when shuffling a ulong, so we shuffle it as 2x uint instead
+ uint2 curr_as_uint2 = as_uint2(curr);
+ uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j));
+ ulong c = as_ulong(sg_curr_as_uint2);
+ if (c < a)
+ counter++;
+ }
+
+ curr = next;
+ }
+
+
+ // last iter
+ for (uint j = 0; j < 16; j++)
+ {
+ // some older drivers have a bug when shuffling a ulong, so we shuffle it as 2x uint instead
+ uint2 curr_as_uint2 = as_uint2(curr);
+ uint2 sg_curr_as_uint2 = (uint2)(sub_group_broadcast(curr_as_uint2.x, j), sub_group_broadcast(curr_as_uint2.y, j));
+ ulong c = as_ulong(sg_curr_as_uint2);
+ if (c < a)
+ counter++;
+ }
+
+ // save elements to its sorted positions
+ if (tid < dispatchRecord.count)
+ output[dispatchRecord.start_offset + counter] = a;
+}
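DO_CountSort scatters each key to a position equal to the number of keys smaller than it, which is a valid sort as long as the keys are unique; that holds here because the index codes built earlier pack the primitive ID into their low bits. A scalar C reference of the rank computation (not part of the diff; sample values are illustrative):

    #include <stdio.h>

    static void rank_sort(const unsigned long long *in, unsigned long long *out, unsigned n)
    {
        for (unsigned i = 0; i < n; i++) {
            unsigned rank = 0;
            for (unsigned j = 0; j < n; j++)
                if (in[j] < in[i])
                    rank++;               /* count keys smaller than this one */
            out[rank] = in[i];            /* scatter the key to its sorted position */
        }
    }

    int main(void)
    {
        unsigned long long in[5] = {42, 7, 99, 13, 1}, out[5];
        rank_sort(in, out, 5);
        for (unsigned i = 0; i < 5; i++)
            printf("%llu ", out[i]);      /* 1 7 13 42 99 */
        printf("\n");
        return 0;
    }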
+
+void DO_Bitonic(struct BLSDispatchRecord dispatchRecord, local ulong* SLM_shared, global ulong* output)
+{
+ uint lid = get_local_id(0);
+ uint elements_to_sort = BOTTOM_LEVEL_SORT_THRESHOLD;
+ while ((elements_to_sort >> 1) >= dispatchRecord.count && elements_to_sort >> 1 >= BOTTOM_LEVEL_SORT_WG_SIZE)
+ {
+ elements_to_sort >>= 1;
+ }
+
+ for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
+ {
+ uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
+
+ if (tid >= dispatchRecord.count)
+ SLM_shared[tid] = ULONG_MAX;
+ else
+ SLM_shared[tid] = ((global ulong*)(dispatchRecord.keys_in))[dispatchRecord.start_offset + tid];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint k_iterations = elements_to_sort;
+ while(k_iterations >> 1 >= dispatchRecord.count && k_iterations != 0)
+ {
+ k_iterations >>= 1;
+ }
+
+ for (unsigned int k = 2; k <= k_iterations; k *= 2)
+ {
+ for (unsigned int j = k / 2; j > 0; j /= 2)
+ {
+ // this loop is needed when we can't create a big enough workgroup, so each workitem processes multiple elements
+ for (uint i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
+ {
+ uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
+ unsigned int ixj = tid ^ j;
+ if (ixj > tid)
+ {
+ if ((tid & k) == 0)
+ {
+ if (SLM_shared[tid] > SLM_shared[ixj])
+ {
+ ulong tmp = SLM_shared[tid];
+ SLM_shared[tid] = SLM_shared[ixj];
+ SLM_shared[ixj] = tmp;
+ }
+ }
+ else
+ {
+ if (SLM_shared[tid] < SLM_shared[ixj])
+ {
+ ulong tmp = SLM_shared[tid];
+ SLM_shared[tid] = SLM_shared[ixj];
+ SLM_shared[ixj] = tmp;
+ }
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ }
+
+ for (int i = 0; i < elements_to_sort / BOTTOM_LEVEL_SORT_WG_SIZE; i++)
+ {
+ uint tid = lid + i * BOTTOM_LEVEL_SORT_WG_SIZE;
+
+ if (tid < dispatchRecord.count)
+ output[dispatchRecord.start_offset + tid] = SLM_shared[tid];
+ }
+}
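DO_Bitonic runs the classic k/j compare-exchange network in SLM, padding unused slots with ULONG_MAX so the element count behaves as a power of two. A scalar C reference of the same network for a power-of-two count (not part of the diff; padding is assumed to be handled by the caller, sample array is illustrative):

    #include <stdio.h>

    static void bitonic_sort(unsigned long long *a, unsigned n)   /* n must be a power of two */
    {
        for (unsigned k = 2; k <= n; k *= 2) {
            for (unsigned j = k / 2; j > 0; j /= 2) {
                for (unsigned i = 0; i < n; i++) {
                    unsigned ixj = i ^ j;
                    if (ixj > i) {
                        int ascending = (i & k) == 0;
                        if ((ascending && a[i] > a[ixj]) ||
                            (!ascending && a[i] < a[ixj])) {
                            unsigned long long t = a[i];   /* compare-exchange */
                            a[i] = a[ixj];
                            a[ixj] = t;
                        }
                    }
                }
            }
        }
    }

    int main(void)
    {
        unsigned long long a[8] = {5, 3, 8, 1, 9, 2, 7, 4};
        bitonic_sort(a, 8);
        for (unsigned i = 0; i < 8; i++)
            printf("%llu ", a[i]);   /* 1 2 3 4 5 7 8 9 */
        printf("\n");
        return 0;
    }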
+
+
+
+
+void DO_Create_Separate_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
+{
+ uint lid = get_local_id(0);
+
+ uint start = context->start[lid];
+ uint count = context->count[lid];
+ uint start_offset = context->start_offset + start;
+
+ struct BLSDispatchRecord record;
+ record.start_offset = start_offset;
+ record.count = count;
+ record.keys_in = context->keys_out;
+
+ if (count == 0) // we don't have elements so don't do anything
+ {
+ }
+ else if (count == 1) // single element so just write it out
+ {
+ input[start_offset] = ((global ulong*)record.keys_in)[start_offset];
+ }
+ else if (count <= BOTTOM_LEVEL_SORT_THRESHOLD)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+}
+
+
+
+
+// We try to merge small BLS into larger one within the sub_group
+void DO_Create_SG_Merged_BLS_Work_Parallel(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
+{
+ uint lid = get_local_id(0);
+ uint sid = get_sub_group_local_id();
+
+ uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
+
+ uint start = context->start[lid];
+ uint count = context->count[lid];
+ uint ctx_start_offset = context->start_offset;
+
+ if (sid == 0 || create_msb_work) // these SIMD lanes are the beginning of a merged BLS
+ {
+ struct BLSDispatchRecord record;
+ if (create_msb_work)
+ {
+ record.start_offset = ctx_start_offset + start + count;
+ record.count = 0;
+ }
+ else // SIMD lane 0 case
+ {
+ record.start_offset = ctx_start_offset + start;
+ record.count = count;
+ }
+
+ record.keys_in = context->keys_out;
+
+ uint loop_idx = 1;
+ while (sid + loop_idx < 16) // loop over subgroup
+ {
+ uint _create_msb_work = intel_sub_group_shuffle_down(create_msb_work, 0u, loop_idx);
+ uint _count = intel_sub_group_shuffle_down(count, 0u, loop_idx);
+ uint _start = intel_sub_group_shuffle_down(start, 0u, loop_idx);
+
+ if (_create_msb_work) // found the next MSB work, so the merge range ends
+ break;
+
+ // need to push record since nothing more will fit
+ if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD)
+ {
+ if (record.count == 1)
+ {
+ input[record.start_offset] = record.keys_in[record.start_offset];
+ }
+ else if (record.count > 1)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+ record.start_offset = ctx_start_offset + _start;
+ record.count = _count;
+ }
+ else
+ {
+ record.count += _count;
+ }
+ loop_idx++;
+ }
+ // if we have any elements left, then schedule them
+ if (record.count == 1) // only one element, so just write it out
+ {
+ input[record.start_offset] = record.keys_in[record.start_offset];
+ }
+ else if (record.count > 1)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+ }
+}
+
+
+
+
+// We try to merge small BLS into larger one within the sub_group
+void DO_Create_SG_Merged_BLS_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input)
+{
+ uint lid = get_local_id(0);
+ uint sid = get_sub_group_local_id();
+
+ uint create_msb_work = context->count[lid] > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
+
+ uint start = context->start[lid];
+ uint count = context->count[lid];
+ uint ctx_start_offset = context->start_offset;
+
+ if (sid == 0)
+ {
+ struct BLSDispatchRecord record;
+ record.start_offset = ctx_start_offset + start;
+ record.count = 0;
+ record.keys_in = context->keys_out;
+
+ for (int i = 0; i < 16; i++)
+ {
+ uint _create_msb_work = sub_group_broadcast(create_msb_work, i);
+ uint _count = sub_group_broadcast(count, i);
+ uint _start = sub_group_broadcast(start, i);
+ if (_create_msb_work)
+ {
+ if (record.count == 1) // only one element, so just write it out
+ {
+ input[record.start_offset] = record.keys_in[record.start_offset];
+ }
+ else if (record.count > 1)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+ record.start_offset = ctx_start_offset + _start + _count;
+ record.count = 0;
+ continue;
+ }
+ // need to push record since nothing more will fit
+ if (record.count + _count > BOTTOM_LEVEL_SORT_MERGING_THRESHOLD)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ record.start_offset = ctx_start_offset + _start;
+ record.count = _count;
+ }
+ else
+ {
+ record.count += _count;
+ }
+ }
+ // if we have any elements left, then schedule them
+ if (record.count == 1) // only one element, so just write it out
+ {
+ input[record.start_offset] = record.keys_in[record.start_offset];
+ }
+ else if (record.count > 1)
+ {
+ BLSDispatchQueue_push((global struct BLSDispatchQueue*)scheduler->next_bls_queue, &record);
+ }
+ }
+}
+
+
+
+
+void DO_Create_Work(global struct VContextScheduler* scheduler, global struct MSBRadixContext* context, global ulong* input, local uint* slm_for_wg_scan, uint sg_size, uint wg_size)
+{
+ uint lid = get_local_id(0);
+
+ uint iteration = context->iteration + 1;
+ uint start = context->start[lid];
+ uint count = context->count[lid];
+ uint start_offset = context->start_offset + start;
+
+ uint create_msb_work = count > BOTTOM_LEVEL_SORT_THRESHOLD ? 1 : 0;
+
+#if MERGE_BLS_WITHIN_SG
+ DO_Create_SG_Merged_BLS_Work_Parallel(scheduler, context, input);
+#else
+ DO_Create_Separate_BLS_Work(scheduler, context, input);
+#endif
+
+ uint new_entry_id = wg_scan_inclusive_add_opt(slm_for_wg_scan, create_msb_work, sg_size, wg_size);//work_group_scan_inclusive_add(create_msb_work);
+ uint stack_begin_entry;
+ // the last workitem in the wg holds the total number of new entries
+ if (lid == (MSB_RADIX_NUM_BINS - 1))
+ {
+ stack_begin_entry = atomic_add_global(&scheduler->msb_stack.num_entries, new_entry_id);
+ }
+ stack_begin_entry = work_group_broadcast(stack_begin_entry, (MSB_RADIX_NUM_BINS - 1));
+ new_entry_id += stack_begin_entry - 1;
+
+
+ if (create_msb_work)
+ {
+ scheduler->msb_stack.entries[new_entry_id].start_offset = start_offset;
+ scheduler->msb_stack.entries[new_entry_id].count = count;
+ scheduler->msb_stack.entries[new_entry_id].iteration = iteration;
+ }
+
+ if (lid == 0) {
+ DEBUG_CODE(printf("num of new bls: %d\n", scheduler->next_bls_queue->num_records));
+ }
+}
+
+
+struct BatchedBLSDispatchEntry
+{
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
+ /////////////////////////////////////////////////////////////
+ qword p_data_buffer;
+ qword num_elements; // number of elements in p_data_buffer
+};
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_batched_BLS_dispatch(global struct BatchedBLSDispatchEntry* bls_dispatches)
+{
+ uint dispatch_id = get_group_id(0);
+ uint lid = get_local_id(0);
+
+ local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
+
+ struct BatchedBLSDispatchEntry dispatchArgs = bls_dispatches[dispatch_id];
+ struct BLSDispatchRecord dispatchRecord;
+ dispatchRecord.start_offset = 0;
+ dispatchRecord.count = dispatchArgs.num_elements;
+ dispatchRecord.keys_in = (ulong*)dispatchArgs.p_data_buffer;
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_batched_BLS_dispatch for %d elements\n", dispatchRecord.count));
+
+ if(dispatchRecord.count > 1)
+ DO_Bitonic(dispatchRecord, SLM_shared, (global ulong*)dispatchRecord.keys_in);
+}
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_bottom_level_single_wg(global struct Globals* globals, global ulong* input, global ulong* output)
+{
+ uint lid = get_local_id(0);
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_bottom_level_single_wg for %d elements\n", globals->numPrimitives));
+
+ local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
+
+ struct BLSDispatchRecord dispatchRecord;
+ dispatchRecord.start_offset = 0;
+ dispatchRecord.count = globals->numPrimitives;
+ dispatchRecord.keys_in = (ulong*)input;
+
+ //TODO: count or bitonic here?
+ //DO_Bitonic(dispatchRecord, SLM_shared, output);
+ DO_CountSort(dispatchRecord, SLM_shared, output);
+}
+
+
+
+
+// This kernel initializes first context to start up the whole execution
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_msb_begin(
+ global struct Globals* globals,
+ global struct VContextScheduler* scheduler,
+ global ulong* buf0,
+ global ulong* buf1)
+{
+ uint lid = get_local_id(0);
+ uint gid = get_group_id(0);
+
+ DEBUG_CODE(if (lid == 0)printf("running sort_morton_codes_msb_begin\n"));
+
+ scheduler->contexts[gid].count[lid] = 0;
+
+ if (gid == 0 && lid == 0)
+ {
+ global struct MSBRadixContext* context = &scheduler->contexts[lid];
+ const uint num_prims = globals->numPrimitives;
+
+ scheduler->bls_queue0.num_records = 0;
+ scheduler->bls_queue1.num_records = 0;
+
+ scheduler->curr_bls_queue = &scheduler->bls_queue1;
+ scheduler->next_bls_queue = &scheduler->bls_queue0;
+
+ context->start_offset = 0;
+ context->num_wgs_in_flight = 0;
+ context->num_keys = num_prims;
+ context->iteration = 0;
+ context->keys_in = buf0;
+ context->keys_out = buf1;
+
+ uint msb_wgs_to_dispatch = (num_prims + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD;
+ scheduler->msb_queue.records[0].wgs_to_dispatch = msb_wgs_to_dispatch;
+
+ scheduler->num_wgs_msb = msb_wgs_to_dispatch;
+ scheduler->num_wgs_bls = 0;
+ scheduler->msb_stack.num_entries = 0;
+ scheduler->msb_queue.num_records = 1;
+ }
+}
+
+
+
+
+__attribute__((reqd_work_group_size(MSB_RADIX_NUM_VCONTEXTS, 1, 1)))
+kernel void
+scheduler(global struct VContextScheduler* scheduler, global ulong* buf0, global ulong* buf1)
+{
+ uint lid = get_local_id(0);
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler\n"));
+
+ uint context_idx = lid;
+
+ const uint num_of_stack_entries = scheduler->msb_stack.num_entries;
+
+ uint msb_wgs_to_dispatch = 0;
+ if (lid < num_of_stack_entries)
+ {
+ struct MSBStackEntry entry = scheduler->msb_stack.entries[(num_of_stack_entries-1) - lid];
+ global struct MSBRadixContext* context = &scheduler->contexts[lid];
+ context->start_offset = entry.start_offset;
+ context->num_wgs_in_flight = 0;
+ context->num_keys = entry.count;
+ context->iteration = entry.iteration;
+ context->keys_in = entry.iteration % 2 == 0 ? buf0 : buf1;
+ context->keys_out = entry.iteration % 2 == 0 ? buf1 : buf0;
+
+ msb_wgs_to_dispatch = (entry.count + MSB_WG_SORT_ELEMENTS_THRESHOLD - 1) / MSB_WG_SORT_ELEMENTS_THRESHOLD;
+ scheduler->msb_queue.records[lid].wgs_to_dispatch = msb_wgs_to_dispatch;
+ }
+
+ msb_wgs_to_dispatch = work_group_reduce_add(msb_wgs_to_dispatch); // TODO: if the compiler implementation is slow, consider writing this manually
+
+ if (lid == 0)
+ {
+ // swap queue for next iteration
+ struct BLSDispatchQueue* tmp = scheduler->curr_bls_queue;
+ scheduler->curr_bls_queue = scheduler->next_bls_queue;
+ scheduler->next_bls_queue = tmp;
+
+ scheduler->next_bls_queue->num_records = 0;
+
+ scheduler->num_wgs_bls = scheduler->curr_bls_queue->num_records;
+ scheduler->num_wgs_msb = msb_wgs_to_dispatch;
+
+ if (num_of_stack_entries < MSB_RADIX_NUM_VCONTEXTS)
+ {
+ scheduler->msb_queue.num_records = num_of_stack_entries;
+ scheduler->msb_stack.num_entries = 0;
+ }
+ else
+ {
+ scheduler->msb_queue.num_records = MSB_RADIX_NUM_VCONTEXTS;
+ scheduler->msb_stack.num_entries -= MSB_RADIX_NUM_VCONTEXTS;
+ }
+ }
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_scheduler finished, to spawn %d MSB wgs in %d contexts and %d BLS wgs, MSB records on stack %d\n",
+ scheduler->num_wgs_msb, scheduler->msb_queue.num_records, scheduler->num_wgs_bls, scheduler->msb_stack.num_entries));
+}
+
+
+
+
+// this is the lowest-level sub-task, which produces the final sorted codes
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(BOTTOM_LEVEL_SORT_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_bottom_level( global struct VContextScheduler* scheduler, global ulong* output)
+{
+ uint lid = get_local_id(0);
+
+ DEBUG_CODE(if (get_group_id(0) == 0 && lid == 0) printf("running sort_morton_codes_bottom_level\n"));
+
+ local struct BLSDispatchRecord l_dispatchRecord;
+ if (lid == 0)
+ {
+ uint record_idx = get_group_id(0);
+ l_dispatchRecord = scheduler->curr_bls_queue->records[record_idx];
+ //l_dispatchRecord = BLSDispatchQueue_pop((global struct BLSDispatchQueue*)scheduler->curr_bls_queue);
+ atomic_dec_global(&scheduler->num_wgs_bls);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ struct BLSDispatchRecord dispatchRecord = l_dispatchRecord;
+
+ local ulong SLM_shared[BOTTOM_LEVEL_SORT_THRESHOLD];
+
+ // right now only a single sort variant is used here (count sort; the bitonic call is kept commented out as an alternative)
+ // TODO: maybe implement something else
+ if (1)
+ {
+ //DO_Bitonic(dispatchRecord, SLM_shared, output);
+ DO_CountSort(dispatchRecord, SLM_shared, output);
+ }
+}
+
+
+
+
+#define MSB_COUNT_WG_SIZE MSB_RADIX_NUM_BINS
+#define MSB_COUNT_SG_SIZE 16
+
+// count how many elements per bucket we have
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MSB_COUNT_WG_SIZE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MSB_COUNT_SG_SIZE)))
+void kernel sort_morton_codes_msb_count_items( global struct VContextScheduler* scheduler)
+{
+ uint lid = get_local_id(0);
+ uint lsz = MSB_RADIX_NUM_BINS;
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_count_items\n"));
+
+ local uint bucket_count[MSB_RADIX_NUM_BINS];
+ local uint finish_count;
+ bucket_count[lid] = 0;
+ if (lid == 0)
+ {
+ finish_count = 0;
+ }
+
+ struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler);
+
+ global struct MSBRadixContext* context = dispatchArgs.context;
+
+ global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid;
+ global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end;
+ uint shift_bit = dispatchArgs.shift_bit;
+ uchar shift_byte = shift_bit / 8; // convert the bit shift into a byte offset within the key
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ global uchar* ks = (global uchar*)key_start;
+ ks += shift_byte;
+ global uchar* ke = (global uchar*)key_end;
+ ke += shift_byte;
+
+ // double buffering on value loading
+ if (ks < ke)
+ {
+ uchar bucket_id = *ks;
+ ks += lsz * sizeof(ulong);
+
+ for (global uchar* k = ks; k < ke; k += lsz * sizeof(ulong))
+ {
+ uchar next_bucket_id = *k;
+ atomic_inc_local(&bucket_count[bucket_id]);
+ bucket_id = next_bucket_id;
+ }
+
+ atomic_inc_local(&bucket_count[bucket_id]);
+
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //update global counters for context
+ uint count = bucket_count[lid];
+ if (count > 0)
+ atomic_add_global(&context->count[lid], bucket_count[lid]);
+
+ mem_fence_gpu_invalidate();
+ work_group_barrier(0);
+
+ bool final_wg = true;
+ // count WGs which have reached the end
+ if (dispatchArgs.num_of_wgs > 1)
+ {
+ if (lid == 0)
+ finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ final_wg = finish_count == dispatchArgs.num_of_wgs;
+ }
+
+ local uint partial_dispatches[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE];
+ // if this is last wg for current dispatch, update context
+ if (final_wg)
+ {
+ // code below does work_group_scan_exclusive_add(context->count[lid]);
+ {
+ uint lane_val = context->count[lid];
+ uint sg_result = sub_group_scan_inclusive_add(lane_val);
+
+ partial_dispatches[get_sub_group_id()] = sub_group_broadcast(sg_result, MSB_COUNT_SG_SIZE - 1);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint slm_result = sub_group_scan_exclusive_add(partial_dispatches[get_sub_group_local_id()]);
+ slm_result = sub_group_broadcast(slm_result, get_sub_group_id());
+ uint result = slm_result + sg_result - lane_val;
+ context->start[lid] = result;//work_group_scan_exclusive_add(context->count[lid]);
+ }
+
+ context->count[lid] = 0;
+ if(lid == 0)
+ context->num_wgs_in_flight = 0;
+ }
+}
+
+
+
+
+// sort elements into appropriate buckets
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MSB_RADIX_NUM_BINS, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16)))
+void kernel sort_morton_codes_msb_bin_items(
+ global struct VContextScheduler* scheduler, global ulong* input)
+{
+ uint lid = get_local_id(0);
+ uint lsz = get_local_size(0);
+
+ DEBUG_CODE(if (lid == 0) printf("running sort_morton_codes_msb_bin_items\n"));
+
+ local uint finish_count;
+ if (lid == 0)
+ {
+ finish_count = 0;
+ }
+
+ struct MSBDispatchArgs dispatchArgs = get_msb_dispatch_args(scheduler);
+ global struct MSBRadixContext* context = dispatchArgs.context;
+
+ global ulong* key_start = (global ulong*)dispatchArgs.wg_key_start + lid;
+ global ulong* key_end = (global ulong*)dispatchArgs.wg_key_end;
+ uint shift_bit = dispatchArgs.shift_bit;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ global ulong* sorted_keys = (global ulong*)context->keys_out + context->start_offset;
+
+#if MSB_RADIX_NUM_BINS == MSB_WG_SORT_ELEMENTS_THRESHOLD // special case meaning that we process exactly 1 element per workitem
+ // here we'll do local counting, then move to global
+
+ local uint slm_counters[MSB_RADIX_NUM_BINS];
+ slm_counters[lid] = 0;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint place_in_slm_bucket;
+ uint bucket_id;
+ ulong val;
+
+ bool active_lane = key_start < key_end;
+
+ if (active_lane)
+ {
+ val = *key_start;
+
+ bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
+ place_in_slm_bucket = atomic_inc_local(&slm_counters[bucket_id]);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // overwrite slm_counters with the bins' global base offsets - bins with 0 elements can be skipped since they won't be used anyway
+ if (slm_counters[lid])
+ slm_counters[lid] = atomic_add_global(&context->count[lid], slm_counters[lid]);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint id_in_bucket = slm_counters[bucket_id] + place_in_slm_bucket;//atomic_inc_global(&context->count[bucket_id]);
+
+ if (active_lane)
+ sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
+#else
+ // double buffering on value loading
+ if (key_start < key_end)
+ {
+ ulong val = *key_start;
+ key_start += lsz;
+
+ for (global ulong* k = key_start; k < key_end; k += lsz)
+ {
+ ulong next_val = *k;
+ uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
+ uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]);
+
+ //printf("dec: %llu, val: %llX bucket_id: %X", *k, *k, bucket_id);
+ sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
+
+ val = next_val;
+ }
+
+ uint bucket_id = (val >> (ulong)shift_bit) & (MSB_RADIX_NUM_BINS - 1);
+ uint id_in_bucket = atomic_inc_global(&context->count[bucket_id]);
+
+ sorted_keys[context->start[bucket_id] + id_in_bucket] = val;
+ }
+#endif
+
+ // make sure all groups' "counters" and "starts" are visible to the final workgroup
+ mem_fence_gpu_invalidate();
+ work_group_barrier(0);
+
+ bool final_wg = true;
+ // count WGs which have reached the end
+ if (dispatchArgs.num_of_wgs > 1)
+ {
+ if (lid == 0)
+ finish_count = atomic_inc_global(&context->num_wgs_in_flight) + 1;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ final_wg = finish_count == dispatchArgs.num_of_wgs;
+ }
+
+ local uint slm_for_wg_funcs[MSB_COUNT_WG_SIZE / MSB_COUNT_SG_SIZE];
+ // if this is last wg for current dispatch, then prepare sub-tasks
+ if (final_wg)
+ {
+ DO_Create_Work(scheduler, context, input, slm_for_wg_funcs, 16, MSB_RADIX_NUM_BINS);
+
+ // clear context's counters for future execution
+ context->count[lid] = 0;
+ }
+
+} \ No newline at end of file
diff --git a/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h
new file mode 100644
index 00000000000..c2ab0d4a2c9
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h
@@ -0,0 +1,135 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file contains structure definitions shared by GRL OCL kernels and host code
+//
+
+#pragma once
+
+#include "GRLGen12.h"
+
+// NOTE:
+// MSB (Most Significant Byte) - the part of the sort that performs MSB radix sorting and can spawn additional work
+// BLS (Bottom Level Sort) - the final part of sorting a particular range (currently bitonic), which cannot spawn additional work
+//
+
+#define MSB_RADIX_NUM_BINS 256
+#define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration
+#define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here
+
+#define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? A larger value means more MSB work processed in parallel but more memory used
+
+#define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // the first level doesn't get spawned, so 7 iterations must fit here;
+// since one algorithm iteration can spawn at most MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS entries, we need 7 times that
+
+#define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context
+
+#define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS,
+// so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS
+
+#define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup.
+ // If a single MSB entry needs more, then it will spawn more WGs
+ // after updating this, the computation of the initial workgroup count in msb_radix_bitonic_sort.grl also needs to be updated
+
+#define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 the best value? On SKL it gives the best performance
+// Right now we use 256 workitems in SIMD16, which gives us 16 hw threads; assuming 2KB per thread, we have 32KB of SLM to play with.
+// Since we use ulong (8 bytes) we can store 4096 elements.
+// This also tells us that if the number of elements to sort is less than this, we don't need to allocate the scheduler.
+// Needs to be kept in sync with the GRL constant BOTTOM_LEVEL_SORT_THRESHOLD
+
+#define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the limit up to which small BLSes produced by MSB are merged into a single bigger BLS
+
+GRL_NAMESPACE_BEGIN(GRL)
+
+
+
+
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT)
+
+struct MSBStackEntry
+{
+ uint start_offset;
+ uint count;
+ uint iteration;
+};
+
+struct MSBStack
+{
+ dword num_entries;
+ struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM];
+};
+
+struct MSBRadixContext
+{
+ uint start[MSB_RADIX_NUM_BINS];
+ uint count[MSB_RADIX_NUM_BINS];
+ uint num_wgs_in_flight; // this is used to identify which msb wg is last
+ uint num_keys; // number of keys to process
+ uint iteration;
+ ulong* keys_in;
+ ulong* keys_out;
+
+ uint start_offset; //offset from the beginning of the buffer
+};
+
+struct MSBDispatchRecord
+{
+ uint wgs_to_dispatch; // number of workgroups to dispatch for this record
+};
+
+struct MSBDispatchQueue
+{
+ dword num_records;
+ struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record
+};
+
+// BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks
+struct BLSDispatchRecord
+{
+ uint start_offset; // offset from the beginning of the buffer
+ uint count;
+ ulong* keys_in; // we don't need keys_out since we always write to the same output buffer
+};
+
+struct BLSDispatchQueue
+{
+ dword num_records;
+ struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS];
+};
+
+struct VContextScheduler
+{
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
+ /////////////////////////////////////////////////////////////
+
+ dword num_wgs_msb; // number of MSB workgroups being processed by current iteration
+ dword num_wgs_bls; // number of BLS workgroups being processed by current iteration
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ /////////////////////////////////////////////////////////////
+
+ struct MSBDispatchQueue msb_queue;
+ struct BLSDispatchQueue bls_queue0;
+ struct BLSDispatchQueue bls_queue1;
+
+ struct BLSDispatchQueue* curr_bls_queue;
+ struct BLSDispatchQueue* next_bls_queue;
+
+ struct MSBStack msb_stack;
+
+ struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS];
+};
+
+GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.cl b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl
new file mode 100644
index 00000000000..e123b2f46d3
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_radix_sort.cl
@@ -0,0 +1,9 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+// just inlines the kernels defined in the header
+#include "morton_radix_sort.h"
diff --git a/src/intel/vulkan/grl/gpu/morton_radix_sort.h b/src/intel/vulkan/grl/gpu/morton_radix_sort.h
new file mode 100644
index 00000000000..d58ec829883
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/morton_radix_sort.h
@@ -0,0 +1,855 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "common.h"
+#include "libs/lsc_intrinsics.h"
+
+/* ============================================================================= */
+/* ============================== LSB RADIX SORT =============================== */
+/* ============================================================================= */
+
+#define RADIX_BINS 256
+#define SCATTER_WG_SIZE 512
+#define MORTON_LSB_SORT_NO_SHIFT_THRESHOLD 0xFFFFFFFF // turned off, because the current hierarchy build requires a full sort
+
+uint2 get_thread_range( uint numItems, uint numGroups, uint taskID )
+{
+ uint items_per_group = (numItems / numGroups);
+ uint remainder = numItems - (items_per_group * numGroups);
+ uint startID = taskID * items_per_group + min(taskID, remainder);
+ uint endID = startID + items_per_group + ((taskID < remainder) ? 1 : 0);
+
+ return (uint2)(startID,endID);
+}
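get_thread_range distributes numItems over numGroups as evenly as possible, with the first 'remainder' groups taking one extra item. A standalone C sketch with illustrative sizes (not part of the diff):

    #include <stdio.h>

    int main(void)
    {
        unsigned numItems = 10, numGroups = 4;
        unsigned per_group = numItems / numGroups;               /* 2 */
        unsigned remainder = numItems - per_group * numGroups;   /* 2 */
        for (unsigned task = 0; task < numGroups; task++) {
            unsigned start = task * per_group + (task < remainder ? task : remainder);
            unsigned end   = start + per_group + (task < remainder ? 1 : 0);
            printf("task %u: [%u, %u)\n", task, start, end);     /* [0,3) [3,6) [6,8) [8,10) */
        }
        return 0;
    }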
+
+GRL_INLINE void sort_morton_codes_bin_items_taskID_func(global struct Globals* globals,
+ global uint* global_histogram,
+ global uchar* input,
+ local uint* histogram,
+ uint iteration,
+ uint numGroups,
+ uint numItems,
+ bool shift_primID,
+ uint taskID,
+ uint startID,
+ uint endID)
+{
+ const uint shift = globals->shift;
+
+ for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
+ histogram[i] = 0;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (shift_primID)
+ {
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ // Read the input as ulong and shift it so that the bits representing primID are not
+ // taken into account during sorting; this results in fewer sort iterations when
+ // the morton shift is larger than 8 bits
+ ulong* ptr_ul = (ulong*)&input[8 * i];
+ ulong code = *ptr_ul;
+ uchar* ptr = (uchar*)&code;
+ code >>= shift;
+
+ uchar bin = ptr[iteration];
+ atomic_inc_local(&histogram[bin]);
+ }
+ }
+ else
+ {
+ for (uint i = startID + get_local_id(0); i < endID; i += get_local_size(0))
+ {
+ uchar bin = input[8 * i + iteration];
+ atomic_inc_local(&histogram[bin]);
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
+ global_histogram[RADIX_BINS * taskID + i] = histogram[i];
+}
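The shift handling above relies on the index-code layout built by the morton-code kernel: the morton code sits in the high bits and the primitive ID in the low 'shift' bits, so shifting right before extracting the radix byte keeps primID out of the binning and zeroes the topmost bits, which is what allows LSB passes to be skipped. A standalone C sketch with illustrative bit counts (not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned shift = 20;                        /* bits reserved for primID (illustrative) */
        uint64_t morton = 0x123456789ull;
        uint64_t primID = 0x54321ull;               /* fits in 20 bits */
        uint64_t index_code = (morton << shift) | primID;

        uint64_t for_sorting = index_code >> shift; /* == morton, primID dropped */
        unsigned byte0 = (unsigned)(for_sorting & 0xFF);
        printf("sort key 0x%llx, lowest radix byte 0x%02x\n",
               (unsigned long long)for_sorting, byte0);
        return 0;
    }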
+
+GRL_INLINE void sort_morton_codes_bin_items_func(global struct Globals* globals,
+ global uint* global_histogram,
+ global uint* wg_flags,
+ global uchar* input,
+ local uint* histogram,
+ uint iteration,
+ uint numGroups,
+ uint numItems,
+ bool shift_primID,
+ bool update_wg_flags)
+{
+ if (shift_primID)
+ {
+ // This check is present in the other LSB sort functions as well; its purpose is
+ // to skip the first n iterations, where n is the difference between the maximum number
+ // of iterations and the number actually needed to sort without primIDs
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ // iteration needs to be adjusted to reflect the skipped cycles
+ iteration -= req_iterations;
+ }
+
+ const uint taskID = get_group_id(0);
+
+ if (taskID == 0 && update_wg_flags)
+ {
+ for (uint i = get_local_id(0); i < RADIX_BINS; i += get_local_size(0))
+ wg_flags[i] = 0;
+ }
+
+ uint2 ids = get_thread_range(numItems, numGroups, taskID);
+ uint startID = ids.x;
+ uint endID = ids.y;
+
+ sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, shift_primID,
+ taskID, startID, endID);
+}
+
+__attribute__((reqd_work_group_size(512, 1, 1)))
+void kernel
+sort_morton_codes_bin_items(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global uint* wg_flags,
+ global uchar* input,
+ uint iteration,
+ uint numGroups,
+ uint update_wg_flags
+)
+{
+ local uint histogram[RADIX_BINS];
+ const uint numItems = globals->numPrimitives;
+ if(numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, false, update_wg_flags);
+ else
+ sort_morton_codes_bin_items_func(globals, global_histogram, wg_flags, input, histogram, iteration, numGroups, numItems, true, update_wg_flags);
+}
+
+
+GRL_INLINE void sort_morton_codes_reduce_bins_func(global struct Globals* globals,
+ global uint* global_histogram,
+ local uint* partials,
+ uint numTasks,
+ uint iteration,
+ bool shift_primID)
+{
+ const uint localID = get_local_id(0);
+
+ if (shift_primID)
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+ }
+
+ uint t = 0;
+ for (uint j = 0; j < numTasks; j++)
+ {
+ const uint count = load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + localID], 0);
+ store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + localID], 0, t);
+ t += count;
+ }
+
+ // each lane now contains the number of elements in the corresponding bin
+ // prefix sum this for use in the subsequent scattering pass.
+ uint global_count = t;
+
+ partials[get_sub_group_id()] = sub_group_reduce_add(global_count);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint lane = get_sub_group_local_id();
+ uint p = partials[lane];
+ p = (lane < get_sub_group_id()) ? p : 0;
+
+ global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
+
+ store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numTasks + localID], 0, global_count);
+}
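reduce_bins converts the per-task histograms into offsets: within each bin, the per-task counts become exclusive offsets, and an extra row after the last task receives the exclusive prefix over the bin totals. A scalar C sketch with small illustrative sizes (not part of the diff):

    #include <stdio.h>

    #define BINS 4
    #define TASKS 2

    int main(void)
    {
        /* hist[task][bin] = how many keys task 'task' counted for 'bin' */
        unsigned hist[TASKS + 1][BINS] = { {1, 0, 2, 1}, {2, 1, 0, 3}, {0} };

        unsigned global_count[BINS];
        for (unsigned bin = 0; bin < BINS; bin++) {
            unsigned t = 0;
            for (unsigned task = 0; task < TASKS; task++) {
                unsigned count = hist[task][bin];
                hist[task][bin] = t;      /* per-task exclusive offset within this bin */
                t += count;
            }
            global_count[bin] = t;        /* total keys in this bin */
        }

        unsigned prefix = 0;
        for (unsigned bin = 0; bin < BINS; bin++) {
            hist[TASKS][bin] = prefix;    /* global start of this bin */
            prefix += global_count[bin];
        }

        for (unsigned bin = 0; bin < BINS; bin++)
            printf("bin %u: global start %u\n", bin, hist[TASKS][bin]);
        /* bin totals 3,1,2,4 -> global starts 0,3,4,6 */
        return 0;
    }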
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(256, 1, 1)))
+void kernel
+sort_morton_codes_reduce_bins(global struct Globals* globals,
+ uint numTasks,
+ global uint* global_histogram,
+ uint iteration)
+{
+ local uint partials[RADIX_BINS];
+ const uint numItems = globals->numPrimitives;
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, false);
+ else
+ sort_morton_codes_reduce_bins_func(globals, global_histogram, partials, numTasks, iteration, true);
+}
+
+
+#if 1
+GRL_INLINE void sort_morton_codes_scatter_items_func(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global ulong* input,
+ global ulong* output,
+ local uint* local_offset,
+ local uint* flags,
+ uint iteration,
+ uint numGroups,
+ uint numItems,
+ bool shift_primID,
+ bool update_morton_sort_in_flight)
+{
+ const uint gID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+
+ const uint global_shift = globals->shift;
+ const uint localID = get_local_id(0);
+ const uint taskID = get_group_id(0);
+
+ if (gID == 0 && update_morton_sort_in_flight)
+ globals->morton_sort_in_flight = 0;
+
+ uint2 ids = get_thread_range(numItems, numGroups, taskID);
+ uint startID = ids.x;
+ uint endID = ids.y;
+
+ if (shift_primID)
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ iteration -= req_iterations;
+ }
+
+ const uint shift = 8 * iteration;
+
+ // load the global bin counts, and add each bin's global prefix
+ // to the local prefix
+ {
+ uint global_prefix = 0, local_prefix = 0;
+ if (localID < RADIX_BINS)
+ {
+ local_prefix = global_histogram[RADIX_BINS * taskID + localID];
+ global_prefix = global_histogram[RADIX_BINS * numGroups + localID];
+ local_offset[localID] = global_prefix + local_prefix;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+
+ // move elements in WG-sized chunks. The elements need to be moved sequentially (can't use atomics)
+ // because relative order has to be preserved for LSB radix sort to work
+
+ // For each bin, a bit vector indicating which elements are in the bin
+ for (uint block_base = startID; block_base < endID; block_base += get_local_size(0))
+ {
+ // initialize bit vectors
+ for (uint i = 4 * localID; i < RADIX_BINS * SCATTER_WG_SIZE / 32; i += 4 * get_local_size(0))
+ {
+ flags[i + 0] = 0;
+ flags[i + 1] = 0;
+ flags[i + 2] = 0;
+ flags[i + 3] = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // read sort key, determine which bin it goes into, scatter into the bit vector
+ // and pre-load the local offset
+ uint ID = localID + block_base;
+ ulong key = 0;
+ uint bin_offset = 0;
+ uint bin = 0;
+ uint bin_word = localID / 32;
+ uint bin_bit = 1 << (localID % 32);
+
+ if (ID < endID)
+ {
+ key = input[ID];
+
+ if (shift_primID)
+ bin = ((key >> global_shift) >> shift) & (RADIX_BINS - 1);
+ else
+ bin = (key >> shift) & (RADIX_BINS - 1);
+
+ atomic_add_local(&flags[(SCATTER_WG_SIZE / 32) * bin + bin_word], bin_bit);
+ bin_offset = local_offset[bin];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (ID < endID)
+ {
+ // each key reads the bit-vectors for its bin,
+ // - Computes local prefix sum to determine its output location
+ // - Computes number of items added to its bin (last thread adjusts bin position)
+ uint prefix = 0;
+ uint count = 0;
+ for (uint i = 0; i < (SCATTER_WG_SIZE / 32); i++)
+ {
+ uint bits = flags[(SCATTER_WG_SIZE / 32) * bin + i];
+ uint bc = popcount(bits);
+ uint pc = popcount(bits & (bin_bit - 1));
+ prefix += (i < bin_word) ? bc : 0;
+ prefix += (i == bin_word) ? pc : 0;
+
+ count += bc;
+ }
+
+ // store the key in its proper place..
+ output[prefix + bin_offset] = key;
+
+ // last item for each bin adjusts local offset for next outer loop iteration
+ if (prefix == count - 1)
+ local_offset[bin] += count;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ }
+
+ /* uint local_offset[RADIX_BINS]; */
+ /* uint offset_global = 0; */
+ /* for (int i=0;i<RADIX_BINS;i++) */
+ /* { */
+ /* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
+ /* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
+ /* local_offset[i] = offset_global + offset_local; */
+ /* offset_global += count_global; */
+ /* } */
+
+ /* for (uint ID=startID;ID<endID;ID++) */
+ /* { */
+ /* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
+ /* const uint offset = local_offset[bin]; */
+ /* output[offset] = input[ID]; */
+ /* local_offset[bin]++; */
+ /* } */
+}
+
+#else
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+sort_morton_codes_scatter_items(
+ global struct Globals* globals,
+ uint shift,
+ global uint* global_histogram,
+ global char* input0,
+ global char* input1,
+ unsigned int input0_offset,
+ unsigned int input1_offset,
+ uint iteration)
+{
+ const uint numItems = globals->numPrimitives;
+ const uint local_size = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+ const uint localID = get_local_id(0);
+ const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint startID = (taskID + 0) * numItems / numTasks;
+ const uint endID = (taskID + 1) * numItems / numTasks;
+
+ global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
+ global ulong* output = (global ulong*)((iteration % 2) == 0 ? input1 + input1_offset : input0 + input0_offset);
+
+ local uint local_offset[RADIX_BINS];
+ uint off = 0;
+ for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
+ {
+ const uint count = global_histogram[RADIX_BINS * numTasks + i];
+ const uint offset_task = global_histogram[RADIX_BINS * taskID + i];
+ const uint sum = sub_group_reduce_add(count);
+ const uint prefix_sum = sub_group_scan_exclusive_add(count);
+ local_offset[i] = off + offset_task + prefix_sum;
+ off += sum;
+ }
+
+ for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
+ {
+ const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1);
+ const uint offset = atomic_add_local(&local_offset[bin], 1);
+ output[offset] = input[ID];
+ }
+
+ /* uint local_offset[RADIX_BINS]; */
+ /* uint offset_global = 0; */
+ /* for (int i=0;i<RADIX_BINS;i++) */
+ /* { */
+ /* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
+ /* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
+ /* local_offset[i] = offset_global + offset_local; */
+ /* offset_global += count_global; */
+ /* } */
+
+ /* for (uint ID=startID;ID<endID;ID++) */
+ /* { */
+ /* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
+ /* const uint offset = local_offset[bin]; */
+ /* output[offset] = input[ID]; */
+ /* local_offset[bin]++; */
+ /* } */
+}
+#endif
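The SLM bit-vector machinery in the enabled scatter variant above exists to keep the pass stable: keys falling into the same bin must keep their relative order between passes, otherwise LSB radix sorting breaks. A scalar C reference of one stable LSB pass, chained twice over illustrative 16-bit keys (not part of the diff):

    #include <stdint.h>
    #include <stdio.h>

    #define BINS 256

    static void lsb_pass(const uint64_t *in, uint64_t *out, unsigned n, unsigned shift)
    {
        unsigned count[BINS] = {0}, offset[BINS];
        for (unsigned i = 0; i < n; i++)
            count[(in[i] >> shift) & (BINS - 1)]++;

        unsigned prefix = 0;
        for (unsigned b = 0; b < BINS; b++) {       /* exclusive prefix sum over bins */
            offset[b] = prefix;
            prefix += count[b];
        }

        for (unsigned i = 0; i < n; i++) {          /* sequential scatter keeps stability */
            unsigned b = (in[i] >> shift) & (BINS - 1);
            out[offset[b]++] = in[i];
        }
    }

    int main(void)
    {
        uint64_t a[6] = {0x201, 0x102, 0x101, 0x202, 0x100, 0x200}, b[6];
        lsb_pass(a, b, 6, 0);   /* sort by low byte */
        lsb_pass(b, a, 6, 8);   /* then by next byte: fully sorted for 16-bit keys */
        for (unsigned i = 0; i < 6; i++)
            printf("0x%llx ", (unsigned long long)a[i]);
        printf("\n");           /* 0x100 0x101 0x102 0x200 0x201 0x202 */
        return 0;
    }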
+
+#if 1
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(SCATTER_WG_SIZE, 1, 1)))
+void kernel
+sort_morton_codes_scatter_items(
+ global struct Globals *globals,
+ global uint *global_histogram,
+ global ulong *input,
+ global ulong *output,
+ uint iteration,
+ uint numGroups,
+ uint update_morton_sort_in_flight)
+{
+ local uint local_offset[RADIX_BINS];
+ local uint flags[RADIX_BINS*SCATTER_WG_SIZE/32];
+ const uint numItems = globals->numPrimitives;
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset,
+ flags, iteration, numGroups, numItems, false, update_morton_sort_in_flight);
+ else
+ sort_morton_codes_scatter_items_func(globals, global_histogram, input, output, local_offset,
+ flags, iteration, numGroups, numItems, true, update_morton_sort_in_flight);
+}
+
+#else
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+sort_morton_codes_scatter_items(
+ global struct Globals *globals,
+ uint shift,
+ global uint *global_histogram,
+ global char *input0,
+ global char *input1,
+ unsigned int input0_offset,
+ unsigned int input1_offset,
+ uint iteration)
+{
+ const uint numItems = globals->numPrimitives;
+ const uint local_size = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+ const uint localID = get_local_id(0);
+ const uint globalID = get_local_id(0) + get_group_id(0)*get_local_size(0);
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint startID = (taskID + 0) * numItems / numTasks;
+ const uint endID = (taskID + 1) * numItems / numTasks;
+
+ global ulong *input = (global ulong *)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
+ global ulong *output = (global ulong *)((iteration % 2) == 0 ? input1 + input1_offset : input0 + input0_offset);
+
+ local uint local_offset[RADIX_BINS];
+ uint off = 0;
+ for (int i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
+ {
+ const uint count = global_histogram[RADIX_BINS * numTasks + i];
+ const uint offset_task = global_histogram[RADIX_BINS * taskID + i];
+ const uint sum = sub_group_reduce_add(count);
+ const uint prefix_sum = sub_group_scan_exclusive_add(count);
+ local_offset[i] = off + offset_task + prefix_sum;
+ off += sum;
+ }
+
+ for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
+ {
+ const uint bin = (input[ID] >> shift) & (RADIX_BINS - 1);
+ const uint offset = atomic_add_local(&local_offset[bin], 1);
+ output[offset] = input[ID];
+ }
+
+ /* uint local_offset[RADIX_BINS]; */
+ /* uint offset_global = 0; */
+ /* for (int i=0;i<RADIX_BINS;i++) */
+ /* { */
+ /* const uint count_global = global_histogram[RADIX_BINS*numTasks+i]; */
+ /* const uint offset_local = global_histogram[RADIX_BINS*taskID+i]; */
+ /* local_offset[i] = offset_global + offset_local; */
+ /* offset_global += count_global; */
+ /* } */
+
+ /* for (uint ID=startID;ID<endID;ID++) */
+ /* { */
+ /* const uint bin = (input[ID] >> shift) & (RADIX_BINS-1); */
+ /* const uint offset = local_offset[bin]; */
+ /* output[offset] = input[ID]; */
+ /* local_offset[bin]++; */
+ /* } */
+}
+#endif
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(512, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
+void kernel
+sort_morton_codes_merged(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global uchar* input,
+ uint iteration,
+ uint numGroups
+)
+{
+ const uint numItems = globals->numPrimitives;
+ const uint taskID = get_group_id(0);
+ const uint loc_id = get_local_id(0);
+ const uint lane = get_sub_group_local_id();
+
+ uint2 ids = get_thread_range(numItems, numGroups, taskID);
+ uint startID = ids.x;
+ uint endID = ids.y;
+
+ local uint histogram[RADIX_BINS];
+ local uint hist_tmp[RADIX_BINS];
+
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ {
+ sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, false,
+ taskID, startID, endID);
+ }
+ else
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ iteration -= req_iterations;
+
+ sort_morton_codes_bin_items_taskID_func(globals, global_histogram, input, histogram, iteration, numGroups, numItems, true,
+ taskID, startID, endID);
+ }
+
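+ // Elect the last work group to finish binning: every group bumps morton_sort_in_flight, and
+ // the group that observes numGroups - 1 performs the cross-group prefix sums below.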
+ uint last_group = 0;
+ if (loc_id == 0)
+ last_group = atomic_inc_global(&globals->morton_sort_in_flight);
+
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ last_group = work_group_broadcast(last_group, 0);
+
+ bool isLastGroup = (loc_id < RADIX_BINS) && (last_group == numGroups - 1);
+
+ uint global_count = 0;
+
+ if (isLastGroup)
+ {
+ for (uint j = 0; j < numGroups; j++)
+ {
+ const uint count = (j == taskID) ? histogram[loc_id] : load_uint_L1C_L3C(&global_histogram[RADIX_BINS * j + loc_id], 0);
+ store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * j + loc_id], 0, global_count);
+ global_count += count;
+ }
+
+ hist_tmp[get_sub_group_id()] = (get_sub_group_id() < MAX_HW_SIMD_WIDTH) ? sub_group_reduce_add(global_count) : 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (isLastGroup)
+ {
+ uint p = hist_tmp[lane];
+ p = (lane < get_sub_group_id()) ? p : 0;
+
+ global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
+
+ store_uint_L1WB_L3WB(&global_histogram[RADIX_BINS * numGroups + loc_id], 0, global_count);
+ }
+}
+
+#if 0
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(16))) void kernel
+sort_morton_codes_bin_items(
+ global struct Globals* globals,
+ uint shift,
+ global uint* global_histogram,
+ global char* input0,
+ global char* input1,
+ unsigned int input0_offset,
+ unsigned int input1_offset,
+ uint iteration)
+{
+ const uint numItems = globals->numPrimitives;
+ const uint local_size = get_local_size(0);
+ const uint taskID = get_group_id(0);
+ const uint numTasks = get_num_groups(0);
+ const uint localID = get_local_id(0);
+ const uint globalID = get_local_id(0) + get_group_id(0) * get_local_size(0);
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint subgroup_size = get_sub_group_size();
+
+ const uint startID = (taskID + 0) * numItems / numTasks;
+ const uint endID = (taskID + 1) * numItems / numTasks;
+
+ global ulong* input = (global ulong*)((iteration % 2) == 0 ? input0 + input0_offset : input1 + input1_offset);
+
+#if 1
+ local uint histogram[RADIX_BINS];
+ for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
+ histogram[i] = 0;
+
+ for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
+ {
+ const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1);
+ atomic_add(&histogram[bin], 1);
+ }
+
+ for (uint i = subgroupLocalID; i < RADIX_BINS; i += subgroup_size)
+ global_histogram[RADIX_BINS * taskID + i] = histogram[i];
+
+#else
+ uint histogram[RADIX_BINS];
+ for (int i = 0; i < RADIX_BINS; i++)
+ histogram[i] = 0;
+
+ for (uint ID = startID + subgroupLocalID; ID < endID; ID += subgroup_size)
+ {
+ const uint bin = ((uint)(input[ID] >> (ulong)shift)) & (RADIX_BINS - 1);
+ histogram[bin]++;
+ }
+
+ for (uint i = 0; i < RADIX_BINS; i++)
+ {
+ const uint reduced_counter = sub_group_reduce_add(histogram[i]);
+ global_histogram[RADIX_BINS * taskID + i] = reduced_counter;
+ }
+#endif
+}
+
+#endif
+
+#define WG_SIZE_WIDE 256
+#define SG_SIZE_SCAN 16
+
+// Fast implementation of work_group_scan_exclusive_add using SLM, for a work group size of 256 and a sub-group size of 16
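+//
+// Phase 1: each 16-lane sub-group scans its own values and stores its total in SLM.
+// Phase 2: a single sub-group-wide scan over the 16 sub-group totals gives each HW thread its
+// base offset, which is broadcast back and added to the per-lane partial scan.
+// Worked example (all inputs equal to 1): lane L of HW thread T returns T*16 + L.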
+GRL_INLINE uint work_group_scan_exclusive_add_opt(local uint* tmp, uint val)
+{
+ const uint hw_thread_in_wg_id = get_local_id(0) / SG_SIZE_SCAN;
+ const uint sg_local_id = get_local_id(0) % SG_SIZE_SCAN;
+ const uint NUM_HW_THREADS_IN_WG = WG_SIZE_WIDE / SG_SIZE_SCAN;
+
+ uint acc = sub_group_scan_exclusive_add(val);
+ uint acc2 = acc + val;
+
+ tmp[hw_thread_in_wg_id] = sub_group_broadcast(acc2, SG_SIZE_SCAN - 1);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ uint loaded_val = tmp[sg_local_id];
+ uint wgs_acc = sub_group_scan_exclusive_add(loaded_val);
+ uint acc_for_this_hw_thread = sub_group_broadcast(wgs_acc, hw_thread_in_wg_id);
+ return acc + acc_for_this_hw_thread;
+}
+
+// The wide reduce algorithm is split across two kernels:
+// 1. First, partial exclusive add scans are computed within each work group using SLM.
+//    Then the last work group for each histogram bin performs an exclusive add scan along the
+//    bins using the separate global_histogram_partials buffer. The last work group is
+//    determined with global atomics on the wg_flags buffer.
+// 2. The second kernel globally adds the values from global_histogram_partials to the
+//    histogram buffer holding the partial sums. Finally, the last work group performs one more
+//    work-group scan and add so the histogram buffer values are adjusted with the global ones.
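+//
+// Buffer layout used below: global_histogram holds one RADIX_BINS-wide row per task,
+// global_histogram_partials holds one block sum per (task block, bin) pair, and wg_flags
+// holds one completion counter per bin.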
+GRL_INLINE void sort_morton_codes_reduce_bins_wide_partial_sum_func(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global uint* global_histogram_partials,
+ global uint* wg_flags,
+ local uint* exclusive_scan_tmp,
+ uint numTasks,
+ uint numGroups,
+ uint iteration,
+ bool shift_primID)
+{
+ if (shift_primID)
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ iteration -= req_iterations;
+ }
+
+ const uint groupID = get_group_id(0) % RADIX_BINS;
+ const uint scanGroupID = get_group_id(0) / RADIX_BINS;
+ uint localID = get_local_id(0);
+ uint globalID = localID + (scanGroupID * WG_SIZE_WIDE);
+ const uint lastGroup = (numGroups / WG_SIZE_WIDE);
+ const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1;
+
+ uint temp = 0;
+ uint last_count = 0;
+ if (globalID < numTasks)
+ {
+ temp = global_histogram[RADIX_BINS * globalID + groupID];
+
+ // Store the last value of the work group: either the last element of the histogram or the last item in the work group
+ if (globalID == endID)
+ last_count = temp;
+ }
+
+ uint val = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp);
+
+ if (globalID <= numTasks)
+ {
+ global_histogram[RADIX_BINS * globalID + groupID] = val;
+
+ // Store the block sum value to a separate buffer
+ if (globalID == endID)
+ global_histogram_partials[scanGroupID * WG_SIZE_WIDE + groupID] = val + last_count;
+ }
+
+ // Make sure the global_histogram_partials updates are visible to all work groups
+ write_mem_fence(CLK_GLOBAL_MEM_FENCE);
+ barrier(0);
+
+ // Now determine the last group for each histogram bin, so we know that
+ // all work groups have already updated the global_histogram_partials buffer
+ uint last_group = 0;
+ if (localID == 0)
+ last_group = atomic_inc_global(&wg_flags[groupID]);
+
+ last_group = work_group_broadcast(last_group, 0);
+ bool isLastGroup = (last_group == lastGroup - 1);
+
+ // Each of the last groups computes the scan exclusive add for each partial sum we have
+ if (isLastGroup)
+ {
+ uint temp1 = 0;
+ if (localID < lastGroup)
+ temp1 = global_histogram_partials[localID * WG_SIZE_WIDE + groupID];
+
+ uint val2 = work_group_scan_exclusive_add_opt(exclusive_scan_tmp, temp1);
+
+ if (localID < lastGroup)
+ global_histogram_partials[localID * WG_SIZE_WIDE + groupID] = val2;
+ }
+}
+
+GRL_INLINE void sort_morton_codes_reduce_bins_wide_add_reduce_func(
+ global struct Globals* globals,
+ global uint* global_histogram,
+ global uint* global_histogram_partials,
+ local uint* partials,
+ uint numTasks,
+ uint numGroups,
+ uint iteration,
+ bool shift_primID)
+{
+ if (shift_primID)
+ {
+ const uint req_iterations = globals->sort_iterations;
+ if (iteration < req_iterations)
+ return;
+
+ iteration -= req_iterations;
+ }
+
+ const uint groupID = get_group_id(0) % RADIX_BINS;
+ const uint scanGroupID = get_group_id(0) / RADIX_BINS;
+ const uint lastGroup = (numGroups / WG_SIZE_WIDE);
+ uint localID = get_local_id(0);
+ uint globalID = localID + (scanGroupID * WG_SIZE_WIDE);
+ const uint endID = min(numTasks, (uint)(scanGroupID * WG_SIZE_WIDE + WG_SIZE_WIDE)) - 1;
+
+ // Add the global block sums to the partial scans; skip the first scanGroupID since the first
+ // added value is 0 in the case of exclusive add scans
+ if (scanGroupID > 0 && globalID <= numTasks)
+ {
+ uint add_val = global_histogram_partials[scanGroupID * RADIX_BINS + groupID];
+ atomic_add_global(&global_histogram[globalID * RADIX_BINS + groupID], add_val);
+ }
+
+ // Determine whether this is the last group to finish
+ uint last_group = 0;
+ if (localID == 0)
+ last_group = atomic_inc_global(&globals->morton_sort_in_flight);
+
+ last_group = work_group_broadcast(last_group, 0);
+ bool isLastGroup = (last_group == numGroups - 1);
+
+ // Now do the exclusive scan across all bins over the global totals
+ if (isLastGroup)
+ {
+ mem_fence_gpu_invalidate();
+
+ uint global_count = global_histogram[numTasks * RADIX_BINS + localID];
+
+ partials[get_sub_group_id()] = sub_group_reduce_add(global_count);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint lane = get_sub_group_local_id();
+ uint p = partials[lane];
+ p = (lane < get_sub_group_id()) ? p : 0;
+
+ global_count = sub_group_reduce_add(p) + sub_group_scan_exclusive_add(global_count);
+
+ store_uint_L1WB_L3WB(&global_histogram[numTasks * RADIX_BINS + localID], 0, global_count);
+ }
+}
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN)))
+void kernel
+sort_morton_codes_reduce_bins_wide_partial_sum(
+ global struct Globals* globals,
+ uint numTasks,
+ uint numGroups,
+ global uint* global_histogram,
+ global uint* global_histogram_partials,
+ global uint* wg_flags,
+ uint iteration)
+{
+ local uint exclusive_scan_tmp[WG_SIZE_WIDE / SG_SIZE_SCAN];
+
+ const uint numItems = globals->numPrimitives;
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, false);
+ else
+ sort_morton_codes_reduce_bins_wide_partial_sum_func(globals, global_histogram, global_histogram_partials, wg_flags, exclusive_scan_tmp, numTasks, numGroups, iteration, true);
+}
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(WG_SIZE_WIDE, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(SG_SIZE_SCAN)))
+void kernel
+sort_morton_codes_reduce_bins_wide_add_reduce(
+ global struct Globals* globals,
+ uint numTasks,
+ uint numGroups,
+ global uint* global_histogram,
+ global uint* global_histogram_partials,
+ uint iteration)
+{
+ local uint partials[RADIX_BINS];
+
+ const uint numItems = globals->numPrimitives;
+ if (numItems < MORTON_LSB_SORT_NO_SHIFT_THRESHOLD)
+ sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, false);
+ else
+ sort_morton_codes_reduce_bins_wide_add_reduce_func(globals, global_histogram, global_histogram_partials, partials, numTasks, numGroups, iteration, true);
+}
diff --git a/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl
new file mode 100644
index 00000000000..dee315adcda
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl
@@ -0,0 +1,297 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module msb_radix_bitonic_sort;
+
+kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_debug_print < kernelFunction="debug_print_kernel">;
+ kernel opencl_check_bls < kernelFunction="check_bls_sort">;
+
+ kernel opencl_bottom_level_sort_single_wg < kernelFunction="sort_morton_codes_bottom_level_single_wg">;
+
+ kernel opencl_build_morton_kernel_sort_msb_init < kernelFunction="sort_morton_codes_msb_begin">;
+
+ kernel opencl_build_morton_kernel_sort_msb_scheduler < kernelFunction="scheduler">;
+
+ kernel opencl_build_morton_kernel_sort_bottom_level < kernelFunction="sort_morton_codes_bottom_level">;
+
+ kernel opencl_build_morton_kernel_sort_msb_count_items < kernelFunction="sort_morton_codes_msb_count_items">;
+ kernel opencl_build_morton_kernel_sort_msb_bin_items < kernelFunction="sort_morton_codes_msb_bin_items">;
+
+ kernel opencl_build_morton_kernel_sort_batched_bls_dispatch < kernelFunction="sort_morton_codes_batched_BLS_dispatch">;
+}
+
+
+const MSB_RADIX_NUM_VCONTEXTS = 8;
+const BOTTOM_LEVEL_SORT_THRESHOLD = 512;
+
+struct MSBRadixScheduler
+{
+ dword num_wgs_msb;
+ dword num_wgs_bls;
+
+ dword scheduler_postsync;
+ dword _pad1;
+};
+
+struct MSBRadixArgs
+{
+ qword p_scheduler;
+ qword p_num_primitives;
+};
+
+
+
+
+struct BatchedBLSDispatchEntry
+{
+ qword p_data_buffer;
+ qword num_elements; // number of elements in p_data_buffer
+};
+
+
+
+
+metakernel add_bls_dispatch_init(qword p_storage)
+{
+ define REG_numWgs REG14;
+ define REG_p_storage REG15;
+
+ REG_numWgs = 0;
+ REG_p_storage = p_storage;
+}
+
+
+
+
+// Basically this code does:
+//   bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives };
+//   dispatchID++;
+//
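+// Note: this assumes REG14/REG15 persist across metakernel invocations; add_bls_dispatch_init
+// zeroes the WG counter (REG14) and points REG15 at the BatchedBLSDispatchEntry array, and each
+// call below appends one 16-byte entry, advances REG15 and increments REG14.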
+metakernel add_bls_dispatch(
+ qword p_data,
+ qword p_num_primitives
+)
+{
+ define C_1 REG0;
+ define C_8 REG1;
+
+ define C_MIN_PRIMREFS REG2;
+
+ define REG_p_data REG3;
+ define REG_num_prims REG4;
+ define REG_no_dispatch REG5;
+
+ define REG_numWgs REG14;
+ define REG_p_storage REG15;
+
+ C_MIN_PRIMREFS = 2;
+
+ REG_num_prims = 0;
+ REG_num_prims.lo = load_dword(p_num_primitives);
+
+ REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
+
+ goto l_finish if(REG_no_dispatch.lo);
+
+ C_1 = 1;
+ C_8 = 8;
+
+ // pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data
+ REG_p_data = p_data;
+ store_qword( REG_p_storage, REG_p_data ); // store the data pointer
+
+ REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct
+
+ // pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives
+ store_qword( REG_p_storage, REG_num_prims );
+
+ REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance
+
+ REG_numWgs = REG_numWgs + C_1;
+
+l_finish:
+
+}
+
+
+
+
+metakernel batched_bls_dispatch(
+ qword private_mem
+)
+{
+ define REG_numWgs REG14;
+
+ DISPATCHDIM_X = REG_numWgs;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem);
+}
+
+
+
+
+metakernel sort_bottom_level(
+ qword build_globals,
+ qword input,
+ qword p_num_primitives)
+{
+ define REG_num_prims REG0;
+ define C_MIN_PRIMREFS REG1;
+ define REG_no_dispatch REG2;
+
+ REG_num_prims = load_dword( p_num_primitives );
+
+ C_MIN_PRIMREFS = 2;
+
+ REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
+
+ goto l_finish if(REG_no_dispatch.lo);
+
+ dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
+
+l_finish:
+
+}
+
+
+
+
+metakernel sort(
+ qword build_globals,
+ qword input,
+ qword tmp,
+ MSBRadixArgs sort_args)
+{
+ define REG_num_prims REG0;
+ {
+ define C_MIN_PRIMREFS REG1;
+ define C_MAX_PRIMREFS REG2;
+ define REG_no_dispatch REG3;
+ define REG_dispatch_single_wg REG4;
+
+ REG_num_prims = load_dword( sort_args.p_num_primitives );
+ C_MIN_PRIMREFS = 2;
+ C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD;
+
+ REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
+ REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS;
+
+ goto l_sort_finish if(REG_no_dispatch.lo);
+ goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
+ goto l_full_sort;
+ }
+
+l_dispatch_single_wg:
+
+ {
+ dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
+ goto l_sort_finish;
+ }
+
+l_full_sort:
+
+ define p_scheduler sort_args.p_scheduler;
+ define p_scheduler_postsync (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) );
+ define p_num_wgs_bls (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) );
+
+ define REG_scheduler_postsync REG3;
+ REG_scheduler_postsync = p_scheduler_postsync;
+
+ define C_0 REG4;
+ define C_8 REG5;
+ define C_255 REG6;
+ C_0 = 0;
+ C_8 = 8;
+ C_255 = 255;
+
+ store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
+
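+ // number of work groups for the indirect dispatches below: ceil(num_prims / 256),
+ // computed as (num_prims + 255) >> 8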
+ REG_num_prims = REG_num_prims + C_255;
+ REG_num_prims = REG_num_prims >> C_8;
+
+ DISPATCHDIM_X = REG_num_prims.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ control( cs_store_fence ); // commit the semaphore write
+
+ // initialize the whole execution
+ dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp)
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on init kernel
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+ dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
+ postsync store_dword( p_scheduler_postsync, 2 );
+
+ // wait on count_items kernel
+ semaphore_wait while( *p_scheduler_postsync != 2 );
+
+ dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ define C_MASK_HI REG4;
+ C_MASK_HI = 0x00000000ffffffff;
+
+ l_build_loop:
+ {
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+ {
+ dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on scheduler kernel
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+ }
+
+ // load and process the scheduler results
+ define REG_wg_counts REG0;
+ define REG_num_msb_wgs REG0.lo;
+ define REG_num_bls_wgs REG0.hi;
+ define REG_p_scheduler REG1;
+ define REG_no_msb_wgs REG2;
+ {
+ REG_p_scheduler = p_scheduler;
+ REG_wg_counts = load_qword( REG_p_scheduler );
+
+ REG_no_msb_wgs = REG_wg_counts & C_MASK_HI;
+ REG_no_msb_wgs = REG_no_msb_wgs == 0;
+ }
+
+ // dispatch new bls WGs
+ DISPATCHDIM_X = REG_num_bls_wgs;
+ dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input );
+
+ // jump out if there are no msb WGs
+ goto l_sort_finish if (REG_no_msb_wgs);
+
+ DISPATCHDIM_X = REG_num_msb_wgs;
+ dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
+ postsync store_dword( p_scheduler_postsync, 2 );
+
+ // wait on count_items kernel
+ semaphore_wait while( *p_scheduler_postsync != 2 );
+
+ dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ // wait until all BLS work groups have finished launching
+ semaphore_wait while( *p_num_wgs_bls != 0 );
+
+ goto l_build_loop;
+ }
+
+l_sort_finish:
+
+}
diff --git a/src/intel/vulkan/grl/gpu/new_sah_builder.grl b/src/intel/vulkan/grl/gpu/new_sah_builder.grl
new file mode 100644
index 00000000000..d0a9694acc2
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/new_sah_builder.grl
@@ -0,0 +1,665 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module new_sah_builder;
+
+kernel_module bfs_kernels ("bvh_build_BFS.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial < kernelFunction="BFS_pass1_initial" > ;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed < kernelFunction="BFS_pass1_indexed" > ;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial < kernelFunction="BFS_pass2_initial" > ;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed < kernelFunction="BFS_pass2_indexed" > ;
+
+ kernel opencl_build_kernel_BinnedSAH_DFS < kernelFunction="DFS" >;
+ // kernel opencl_build_kernel_BinnedSAH_BuildQNodes < kernelFunction="build_qnodes" >;
+ kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff < kernelFunction="build_qnodes_pc_kickoff" >;
+ kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify < kernelFunction="build_qnodes_pc_amplify" >;
+ kernel opencl_build_kernel_BinnedSAH_begin < kernelFunction = "begin" >;
+ kernel opencl_build_kernel_BinnedSAH_scheduler < kernelFunction = "scheduler" >;
+
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch < kernelFunction="BFS_pass1_initial_batchable" >;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch < kernelFunction="BFS_pass1_indexed_batchable" >;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch < kernelFunction="BFS_pass2_initial_batchable" >;
+ kernel opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch < kernelFunction="BFS_pass2_indexed_batchable" >;
+
+ kernel opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler < kernelFunction="categorize_builds_and_init_scheduler" >;
+ kernel opencl_build_kernel_BinnedSAH_begin_batched < kernelFunction="begin_batchable" >;
+
+ kernel opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched < kernelFunction="build_qnodes_init_scheduler_batched" >;
+ kernel opencl_build_kernel_BinnedSAH_qnode_begin_batched < kernelFunction="build_qnodes_begin_batchable" >;
+ kernel opencl_build_kernel_BinnedSAH_qnode_scheduler < kernelFunction="build_qnodes_scheduler" >;
+ kernel opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch < kernelFunction="build_qnodes_pc_amplify_batched" >;
+
+ kernel opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched < kernelFunction="build_qnodes_try_to_fill_grb_batched" >;
+
+}
+
+kernel opencl_build_kernel_DFS_single_wg < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg" >
+kernel opencl_build_kernel_DFS_trivial < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial" >
+kernel opencl_build_kernel_DFS_single_wg_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_single_wg_batchable" >
+kernel opencl_build_kernel_DFS_trivial_batch < source="bvh_build_DFS.cl", kernelFunction="DFS_trivial_batchable" >
+
+kernel single_pass_binsah < source="bvh_build_DFS.cl", kernelFunction="DFS" >
+
+
+const DFS_MIN_PRIMREFS = 6;
+const DFS_MAX_PRIMREFS = 256;
+const BFS_WG_SIZE_SHIFT = 9;
+
+
+
+struct Scheduler
+{
+ dword num_bfs_wgs;
+ dword num_dfs_wgs;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds;
+ dword num_single_builds;
+
+ dword batched_build_wg_count;
+ dword batched_build_loop_mask;
+
+};
+
+
+struct SAHBuildArgs
+{
+ qword p_num_primitives;
+ qword p_qnode_child_buffer;
+ qword p_scheduler;
+ qword p_sah_globals;
+ qword p_globals;
+ qword p_primref_buffer;
+ qword p_primref_index_buffers;
+ qword p_bvh_base;
+ qword p_bvh2;
+ qword p_root_buffer_counters;
+ dword sah_build_flags;
+ dword leaf_size;
+ dword leaf_type;
+ dword max_internal_nodes;
+};
+
+
+metakernel single_pass_binsah(
+ qword build_globals,
+ qword bvh_buffer,
+ qword build_primref_buffer,
+ qword build_primref_index_buffers,
+ dword alloc_backpointers )
+{
+
+ dispatch single_pass_binsah(1, 1, 1) args(
+ build_globals,
+ bvh_buffer,
+ build_primref_buffer,
+ build_primref_index_buffers,
+ alloc_backpointers
+ );
+
+}
+
+
+
+metakernel new_sah_build( SAHBuildArgs build_args )
+{
+ define REG_num_prims REG0;
+
+ {
+ define C_MIN_PRIMREFS REG1;
+ define C_MAX_PRIMREFS REG2;
+ define REG_dispatch_trivial REG3;
+ define REG_dispatch_single_wg REG4;
+
+ REG_num_prims = load_dword( build_args.p_num_primitives );
+ C_MIN_PRIMREFS = DFS_MIN_PRIMREFS;
+ C_MAX_PRIMREFS = DFS_MAX_PRIMREFS;
+
+ REG_dispatch_trivial = REG_num_prims <= C_MIN_PRIMREFS;
+ REG_dispatch_single_wg = REG_num_prims <= C_MAX_PRIMREFS;
+
+ goto l_dispatch_trivial if(REG_dispatch_trivial.lo);
+ goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
+ goto l_full_build;
+ }
+
+l_dispatch_trivial:
+ {
+ dispatch opencl_build_kernel_DFS_trivial (1,1,1)
+ args( build_args.p_globals,
+ build_args.p_bvh_base,
+ build_args.p_primref_buffer,
+ build_args.p_primref_index_buffers,
+ build_args.sah_build_flags
+ );
+
+ control( wait_idle );
+ goto l_done;
+ }
+
+l_dispatch_single_wg:
+ {
+ dispatch opencl_build_kernel_DFS_single_wg (1,1,1)
+ args( build_args.p_globals,
+ build_args.p_bvh_base,
+ build_args.p_primref_buffer,
+ build_args.p_primref_index_buffers,
+ build_args.sah_build_flags
+ );
+
+ control( wait_idle );
+ goto l_done;
+ }
+
+
+l_full_build:
+
+
+ {
+ define p_scheduler build_args.p_scheduler;
+ define p_num_dfs_wgs build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs);
+ define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
+ define C_0 REG1;
+ define C_8 REG2;
+ C_8 = 8;
+ C_0 = 0;
+
+
+ //
+ // Init pass
+ //
+ store_dword( p_scheduler_postsync, C_0.lo );
+
+ // compute number of BFS WGs from prim-count
+ // NOTE: This code uses a hardcoded WG size of 512 for BFS
+ // If the BFS WG size ever changes, the shift amounts below must be updated as well.
+ // The shift is done in two steps because the DG2 shifter only supports power-of-two shift amounts.
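+ // i.e. num_bfs_wgs = ceil(num_prims / 512) = (num_prims + 511) >> 9, realized below as two
+ // power-of-two shifts: >> 8 followed by >> 1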
+ {
+ define REG_scheduler_postsync REG3;
+ define C_511 REG4;
+ define C_1 REG5;
+
+ REG_scheduler_postsync = p_scheduler_postsync;
+ C_511 = 511;
+ C_1 = 1;
+
+ store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
+
+ REG_num_prims = REG_num_prims + C_511;
+ REG_num_prims = REG_num_prims >> C_8;
+ REG_num_prims = REG_num_prims >> C_1;
+
+ DISPATCHDIM_X = REG_num_prims.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ control( cs_store_fence ); // commit the semaphore write
+
+ // launch scheduler init kernel
+ dispatch opencl_build_kernel_BinnedSAH_begin (1,1,1)
+ args(
+ build_args.p_scheduler,
+ build_args.leaf_size,
+ build_args.leaf_type,
+ build_args.p_primref_index_buffers,
+ build_args.p_primref_buffer,
+ build_args.p_bvh2,
+ build_args.p_bvh_base,
+ build_args.p_globals,
+ build_args.p_sah_globals,
+ build_args.p_qnode_child_buffer,
+ build_args.sah_build_flags
+ )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on init kernel
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+ // launch BFS1 pass1
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals)
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ // wait on BFS pass1
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+
+ // launch BFS pass2
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 1 );
+ }
+
+ // after BFS pass 2 we drop into a scheduling loop
+
+ l_build_loop:
+ {
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+ {
+ dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
+ args( build_args.p_scheduler, build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ // wait on the scheduler
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+ }
+
+ // load and process the scheduler results
+ define REG_wg_counts REG0;
+ define REG_num_bfs_wgs REG0.lo;
+ define REG_num_dfs_wgs REG0.hi;
+ define REG_loop_break REG1;
+ define REG_p_scheduler REG2;
+ {
+ REG_p_scheduler = p_scheduler;
+ REG_wg_counts = load_qword( REG_p_scheduler );
+
+ define C_MASK_LO REG3 ;
+ C_MASK_LO = 0xffffffff;
+
+ REG_loop_break = REG_wg_counts & C_MASK_LO;
+ REG_loop_break = REG_loop_break == 0;
+ }
+
+ // dispatch new DFS WGs
+ DISPATCHDIM_X = REG_num_dfs_wgs;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
+ args( p_scheduler,
+ build_args.p_sah_globals );
+
+ // jump out if there are no bfs WGs
+ goto l_build_qnodes if (REG_loop_break);
+
+ // dispatch new BFS1 WGs
+ DISPATCHDIM_X = REG_num_bfs_wgs;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed
+ args( p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 2 );
+
+ semaphore_wait while( *p_scheduler_postsync != 2 );
+
+ // dispatch new BFS2 WGs
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed
+ args( p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
+
+ // wait until all upcoming DFS WGs have finished launching
+ // so that the scheduler can refill the launch array
+ // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
+ semaphore_wait while( *p_num_dfs_wgs != 0 );
+
+
+ goto l_build_loop;
+ }
+ }
+
+l_build_qnodes:
+
+ control( wait_idle );
+
+ // Producer/consumer (P/C) qnode build
+
+ dispatch opencl_build_kernel_BinnedSAH_BuildQNodes_Kickoff (1,1,1)
+ args( build_args.p_sah_globals,
+ build_args.p_qnode_child_buffer,
+ build_args.sah_build_flags );
+
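+ // Producer/consumer loop over p_root_buffer_counters: each pass loads the 'produced' counter,
+ // hands the newly produced range [consumed, produced) to ceil(count / 2) Amplify work groups,
+ // and repeats until no new entries were produced.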
+ {
+ define p_pc_counters ( build_args.p_root_buffer_counters );
+
+ define REG_addr REG0;
+ define REG_produced REG1;
+ define REG_consumed REG2;
+ define REG_have_work REG3;
+ define REG_wg_count REG4;
+ define C_8 REG5;
+ define C_16 REG6;
+ define C_1 REG7;
+ C_1 = 1;
+ C_8 = 8;
+ C_16 = 16;
+ REG_addr = build_args.p_root_buffer_counters; // HINT: should we use REG_addr or just pass separate arguments to metakernel to avoid add/sub from address
+
+ REG_consumed = 0;
+
+ l_qnode_loop:
+
+ control( wait_idle ); // wait for previous pass
+
+ // load counters and compute number of wgs to respawn
+ REG_produced = load_qword( REG_addr ); REG_addr = REG_addr + C_8;
+ REG_wg_count = REG_produced - REG_consumed;
+ REG_have_work = REG_wg_count > 0;
+
+ goto l_done if not(REG_have_work.lo);
+
+ // save REG_consumed as a starting position in p_qnode_child_buffer
+ store_qword(REG_addr, REG_consumed); REG_addr = REG_addr + C_8;
+
+ // save REG_produced as ending position in p_qnode_child_buffer
+ store_qword(REG_addr, REG_produced); REG_addr = REG_addr - C_16;
+
+ REG_consumed = REG_consumed + REG_wg_count; // update consumed for next iteration
+
+ // calculate amount of workgroups to schedule
+ REG_wg_count = REG_wg_count + C_1;
+ REG_wg_count = REG_wg_count >> C_1;
+
+ DISPATCHDIM_X = REG_wg_count.lo;
+
+ control( cs_store_fence ); // commit the stores
+
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify
+ args( build_args.p_sah_globals,
+ build_args.p_qnode_child_buffer,
+ build_args.sah_build_flags);
+
+ goto l_qnode_loop;
+ }
+
+l_done:
+}
+
+
+
+
+
+
+
+
+
+struct SAHBuildArgsBatchable
+{
+ qword p_globals_ptrs;
+ qword p_scheduler;
+ qword p_buffers_info;
+ qword p_sah_globals;
+
+ dword num_max_qnode_global_root_buffer_entries;
+ dword num_builds;
+
+};
+
+
+metakernel new_sah_build_batchable( SAHBuildArgsBatchable build_args )
+{
+ define p_scheduler build_args.p_scheduler;
+ define p_scheduler_postsync (build_args.p_scheduler + offsetof(Scheduler.scheduler_postsync) );
+ define p_num_dfs_wgs (build_args.p_scheduler + offsetof(Scheduler.num_dfs_wgs));
+
+ // initialize scheduler semaphore
+ REG0.lo = 0;
+ store_dword( p_scheduler_postsync, REG0.lo );
+
+
+ // dispatch categorization pass
+ dispatch opencl_build_kernel_BinnedSAH_categorize_builds_and_init_scheduler(2,1,1)
+ args(
+ build_args.p_scheduler,
+ build_args.p_globals_ptrs,
+ build_args.p_buffers_info,
+ build_args.p_sah_globals,
+ build_args.num_builds
+ )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on the categorization pass
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+
+ // dispatch the trivial and single-WG passes
+ {
+ REG0 = load_qword( build_args.p_scheduler + offsetof(Scheduler.num_trivial_builds) );
+ DISPATCHDIM_X = REG0.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ // dispatch trivial builds
+
+ dispatch_indirect opencl_build_kernel_DFS_trivial_batch
+ args( build_args.p_sah_globals );
+
+ control( wait_idle );
+
+ // dispatch single-wg builds
+
+ DISPATCHDIM_X = REG0.hi;
+ dispatch_indirect opencl_build_kernel_DFS_single_wg_batch
+ args( build_args.p_sah_globals, build_args.p_scheduler );
+ }
+
+ // compute the number of builds not covered by the trivial and single-WG passes
+ // skip the builder loop if all builds were satisfied by those passes
+ {
+ REG1 = REG0.lo;
+ REG2 = REG0.hi;
+ REG3 = build_args.num_builds;
+ REG5 = REG2 + REG1;
+ REG5 = REG3 - REG5;
+ REG4 = REG5 == 0 ;
+
+ goto l_done if (REG4.lo);
+ }
+
+ // REG5 (number of non-trivial builds) will be used to launch build_qnodes kernel after the build loop
+ define REG_num_nontrivial REG5;
+
+l_build_outer_loop:
+ {
+
+ // configure the scheduler to initiate a new block of builds
+
+ dispatch opencl_build_kernel_BinnedSAH_begin_batched (1,1,1)
+ args( build_args.p_scheduler, build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ // wait on init kernel
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+
+
+ // read results produced by scheduler init kernel
+ // lo == BFS wg count. hi == all ones if we need to loop again
+ //
+ REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
+ REG4 = load_qword( REG0 );
+
+ // launch BFS1 pass1
+ DISPATCHDIM_X = REG4.lo;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_initial_batch
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals)
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on BFS pass1
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+
+ // launch BFS pass2
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_initial_batch
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ l_build_loop:
+ {
+ semaphore_wait while( *p_scheduler_postsync != 0 );
+
+ {
+ dispatch opencl_build_kernel_BinnedSAH_scheduler(1,1,1)
+ args( build_args.p_scheduler, build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 1 );
+
+ // wait on the scheduler
+ semaphore_wait while( *p_scheduler_postsync != 1 );
+ }
+
+ // load and process the scheduler results
+ define REG_wg_counts REG0;
+ define REG_num_bfs_wgs REG0.lo;
+ define REG_num_dfs_wgs REG0.hi;
+ define REG_loop_break REG1;
+ define REG_p_scheduler REG2;
+ {
+ REG_p_scheduler = p_scheduler;
+ REG_wg_counts = load_qword( REG_p_scheduler );
+
+ define C_MASK_LO REG3 ;
+ C_MASK_LO = 0xffffffff;
+
+ REG_loop_break = REG_wg_counts & C_MASK_LO;
+ REG_loop_break = REG_loop_break == 0;
+ }
+
+ // dispatch new DFS WGs
+ DISPATCHDIM_X = REG_num_dfs_wgs;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_DFS
+ args( p_scheduler,
+ build_args.p_sah_globals );
+
+ // jump out if there are no bfs WGs
+ goto l_continue_outer_loop if (REG_loop_break);
+
+ // dispatch new BFS1 WGs
+ DISPATCHDIM_X = REG_num_bfs_wgs;
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass1_indexed_batch
+ args( p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 2 );
+
+ semaphore_wait while( *p_scheduler_postsync != 2 );
+
+ // dispatch new BFS2 WGs
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BFS_pass2_indexed_batch
+ args( p_scheduler,
+ build_args.p_sah_globals )
+ postsync store_dword( p_scheduler_postsync, 0 );
+
+ //goto l_build_loop if not(REG_num_dfs_wgs); //TODO: maybe add some logic to do "bool have_dfs_work" which will add some cycles but may be faster than checking semaphore
+
+ // wait until all upcoming DFS WGs have finished launching
+ // so that the scheduler can refill the launch array
+ // TODO_OPT: Look at replacing this with a ring buffer so that scheduler stalls instead (and only rarely)
+ semaphore_wait while( *p_num_dfs_wgs != 0 );
+
+ goto l_build_loop;
+ }
+
+
+ l_continue_outer_loop:
+
+
+ goto l_build_outer_loop if(REG4.hi);
+
+ }
+
+////////
+//
+// Qnode build phase
+//
+////////
+
+ // Wait for all outstanding DFS dispatches to complete, then build the QNodes
+ control( wait_idle );
+
+ define REG_wg_counts REG1;
+ define REG_p_scheduler REG2;
+ define REG_have_work REG3;
+ define REG_GRB_NUM_MAX_ENTRIES REG4;
+
+ // init scheduler for qnode phase
+ dispatch opencl_build_kernel_BinnedSAH_qnode_init_scheduler_batched(1,1,1)
+ args( build_args.p_scheduler,
+ build_args.num_builds,
+ build_args.num_max_qnode_global_root_buffer_entries);
+
+ REG_p_scheduler = p_scheduler;
+
+ control( wait_idle );
+
+ REG_wg_counts = load_qword( REG_p_scheduler );
+
+ DISPATCHDIM_X = REG_wg_counts.lo;
+
+ // configure the scheduler to initiate a new block of builds
+ dispatch_indirect opencl_build_kernel_BinnedSAH_qnode_begin_batched
+ args( build_args.p_scheduler,
+ build_args.p_sah_globals);
+
+ // read results produced by init scheduler kernel
+ // lo == num of builds processed. hi == num of maximum global root buffer entries
+ //
+ REG0 = build_args.p_scheduler + offsetof(Scheduler.batched_build_wg_count);
+ REG5 = load_qword( REG0 );
+
+ REG_GRB_NUM_MAX_ENTRIES.lo = REG5.hi;
+ REG_GRB_NUM_MAX_ENTRIES.hi = 0;
+
+l_qnode_loop:
+ {
+ control( wait_idle ); // wait for previous pass
+
+ dispatch opencl_build_kernel_BinnedSAH_qnode_scheduler(1,1,1) args( build_args.p_scheduler );
+
+ control( wait_idle );
+
+ REG_wg_counts = load_qword( REG_p_scheduler );
+ REG_have_work = REG_wg_counts > 0;
+
+ goto l_done if not(REG_have_work.lo);
+
+ DISPATCHDIM_X = REG_wg_counts.lo;
+
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes_Amplify_batch
+ args( build_args.p_sah_globals,
+ build_args.p_scheduler );
+
+ control( wait_idle );
+
+ REG_wg_counts = load_qword( REG_p_scheduler ); // reload values
+ REG_wg_counts.lo = REG_wg_counts.hi;
+ REG_wg_counts.hi = 0;
+
+ REG_have_work = REG_wg_counts < REG_GRB_NUM_MAX_ENTRIES;
+
+ goto l_qnode_loop if not(REG_have_work.lo);
+
+ DISPATCHDIM_X = REG5.lo; // dispatch single workgroup for each build scheduled
+
+ dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQnodes_TryToFillGRB_batched
+ args( build_args.p_sah_globals,
+ build_args.p_scheduler );
+
+ goto l_qnode_loop;
+ }
+
+////////
+//
+// Old implementation - TODO: maybe add switch between two implementations?
+//
+////////
+ // Wait for all outstanding DFS dispatches to complete, then build the QNodes
+ //DISPATCHDIM_X = REG5.lo;
+
+ //dispatch_indirect opencl_build_kernel_BinnedSAH_BuildQNodes
+ // args( build_args.p_sah_globals, build_args.p_scheduler );
+
+
+l_done:
+
+ control( wait_idle );
+
+}
diff --git a/src/intel/vulkan/grl/gpu/postbuild_info.grl b/src/intel/vulkan/grl/gpu/postbuild_info.grl
new file mode 100644
index 00000000000..3039e533a9b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/postbuild_info.grl
@@ -0,0 +1,49 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module postbuild_info; // In postbuild we assume output data structure to be DXR compatible
+
+kernel compacted_size < source="bvh_postbuild_info.cl", kernelFunction="compacted_size" >
+kernel current_size < source="bvh_postbuild_info.cl", kernelFunction="current_size" >
+kernel serialized_size < source="bvh_postbuild_info.cl", kernelFunction="serialized_size" >
+kernel decoded_size < source="bvh_postbuild_info.cl", kernelFunction="decoded_size" >
+
+metakernel compacted_size(
+ qword bvh,
+ qword postbuildInfo)
+{
+ dispatch compacted_size(1,1,1) args(
+ bvh,
+ postbuildInfo);
+}
+
+metakernel current_size(
+ qword bvh,
+ qword postbuildInfo)
+{
+ dispatch current_size(1,1,1) args(
+ bvh,
+ postbuildInfo);
+}
+
+metakernel serialized_size(
+ qword bvh,
+ qword postbuildInfo)
+{
+ dispatch serialized_size(1,1,1) args(
+ bvh,
+ postbuildInfo);
+}
+
+metakernel decoded_size(
+ qword bvh,
+ qword postbuildInfo)
+{
+ dispatch decoded_size(1,1,1) args(
+ bvh,
+ postbuildInfo);
+}
diff --git a/src/intel/vulkan/grl/gpu/presplit.grl b/src/intel/vulkan/grl/gpu/presplit.grl
new file mode 100644
index 00000000000..d0f6e53fbb1
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/presplit.grl
@@ -0,0 +1,62 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module presplit;
+
+kernel_module presplit_kernels ("bvh_build_presplit.cl")
+{
+ links lsc_intrinsics;
+
+ kernel opencl_kernel_compute_num_presplits < kernelFunction="compute_num_presplits" >;
+ kernel opencl_kernel_priority_sum < kernelFunction="priority_sum" >;
+ kernel opencl_kernel_perform_presplits < kernelFunction="perform_presplits" >;
+}
+
+import struct MKBuilderState "structs.grl";
+import struct MKSizeEstimate "structs.grl";
+
+
+metakernel compute_num_presplits(
+ MKBuilderState state,
+ qword presplit_buffer,
+ dword numHwThreads )
+{
+ dispatch opencl_kernel_compute_num_presplits ( numHwThreads, 1, 1 ) args(
+ state.build_globals,
+ state.bvh_buffer,
+ state.build_primref_buffer,
+ presplit_buffer,
+ state.geomDesc_buffer );
+}
+
+
+metakernel priority_sum(
+ MKBuilderState state,
+ MKSizeEstimate estimate,
+ qword presplit_buffer )
+{
+ dispatch opencl_kernel_priority_sum ( 1, 1, 1 ) args(
+ state.build_globals,
+ presplit_buffer,
+ estimate.numPrimitivesToSplit / 2 );
+}
+
+metakernel perform_presplits(
+ MKBuilderState state,
+ MKSizeEstimate estimate,
+ qword presplit_buffer,
+ dword numHwThreads )
+{
+ dispatch opencl_kernel_perform_presplits ( numHwThreads, 1, 1 ) args(
+ state.build_globals,
+ state.bvh_buffer,
+ state.build_primref_buffer,
+ presplit_buffer,
+ state.bvh_buffer,
+ state.geomDesc_buffer,
+ estimate.numPrimitivesToSplit / 2 );
+}
diff --git a/src/intel/vulkan/grl/gpu/qbvh6.h b/src/intel/vulkan/grl/gpu/qbvh6.h
new file mode 100644
index 00000000000..22260d07f41
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/qbvh6.h
@@ -0,0 +1,933 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLGen12.h"
+
+#include "shared.h"
+#include "quad.h"
+
+/* ====== GENERAL BVH config ====== */
+
+#define BVH_NODE_N6 6
+#define BVH_NODE_N 8
+#define BVH_NODE_N_LOG 3
+
+#define SAH_LOG_BLOCK_SHIFT 2
+#define BVH_LEAF_N_MIN BVH_NODE_N6
+#define BVH_LEAF_N_MAX BVH_NODE_N6
+
+#define BVH_NODE_DEFAULT_MASK 0xff
+#define BVH_NODE_DEGENERATED_MASK 0x00
+
+/* ====== QUANTIZATION config ====== */
+
+#define QUANT_BITS 8
+#define QUANT_MIN 0
+#define QUANT_MAX 255
+#define QUANT_MAX_MANT (255.0f / 256.0f)
+
+#define NO_NODE_OFFSET 0
+
+/* ======================================================================= */
+/* ============================== BVH BASE =============================== */
+/* ======================================================================= */
+
+GRL_INLINE void setBVHBaseBounds(struct BVHBase *base, struct AABB *aabb)
+{
+ base->Meta.bounds.lower[0] = aabb->lower.x;
+ base->Meta.bounds.lower[1] = aabb->lower.y;
+ base->Meta.bounds.lower[2] = aabb->lower.z;
+
+ base->Meta.bounds.upper[0] = aabb->upper.x;
+ base->Meta.bounds.upper[1] = aabb->upper.y;
+ base->Meta.bounds.upper[2] = aabb->upper.z;
+}
+
+GRL_INLINE global struct QBVHNodeN *BVHBase_nodeData(struct BVHBase *bvh)
+{
+ return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET);
+}
+
+GRL_INLINE global struct QBVHNodeN *BVHBase_rootNode(struct BVHBase *bvh)
+{
+ return (global struct QBVHNodeN *)((void *)bvh + BVH_ROOT_NODE_OFFSET);
+}
+
+GRL_INLINE global struct Quad *BVHBase_quadLeaves(struct BVHBase *bvh)
+{
+ return (global struct Quad *)((void *)bvh + 64 * (ulong)bvh->quadLeafStart);
+}
+
+GRL_INLINE uint64_t BVHBase_numNodes(struct BVHBase *bvh)
+{
+ return bvh->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64;
+}
+
+GRL_INLINE uint64_t BVHBase_numQuads(struct BVHBase *bvh)
+{
+ return bvh->quadLeafCur - bvh->quadLeafStart;
+}
+
+GRL_INLINE uint64_t BVHBase_numProcedurals(struct BVHBase *bvh)
+{
+ return bvh->proceduralDataCur - bvh->proceduralDataStart;
+}
+
+GRL_INLINE uint64_t BVHBase_numInstances(struct BVHBase *bvh)
+{
+ return bvh->instanceLeafEnd - bvh->instanceLeafStart;
+}
+
+/* =================================================================== */
+/* ============================== QBVH =============================== */
+/* =================================================================== */
+
+__constant const float ulp = FLT_EPSILON;
+
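+// Inflate an AABB on each side by FLT_EPSILON times its largest absolute coordinate,
+// yielding a slightly enlarged, conservative box.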
+GRL_INLINE struct AABB conservativeAABB(struct AABB *aabb)
+{
+ struct AABB box;
+ const float4 v4 = max(fabs(aabb->lower), fabs(aabb->upper));
+ const float v = ulp * max(v4.x, max(v4.y, v4.z));
+ box.lower = aabb->lower - (float4)v;
+ box.upper = aabb->upper + (float4)v;
+ return box;
+}
+
+GRL_INLINE struct AABB3f conservativeAABB3f(struct AABB3f* aabb3d)
+{
+ struct AABB aabb4d = AABBfromAABB3f(*aabb3d);
+ struct AABB box = conservativeAABB(&aabb4d);
+ return AABB3fFromAABB(box);
+}
+
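+// Quantized per-child bounds: each child AABB is stored as 8-bit lower/upper coordinates
+// relative to the node origin (QBVHNodeN::lower), scaled per axis by 2^(exp[axis] - 8);
+// see extractAABB_QBVHNodeN for the decoding.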
+struct QBVH_AABB
+{
+ uchar lower_x[BVH_NODE_N6];
+ uchar upper_x[BVH_NODE_N6];
+ uchar lower_y[BVH_NODE_N6];
+ uchar upper_y[BVH_NODE_N6];
+ uchar lower_z[BVH_NODE_N6];
+ uchar upper_z[BVH_NODE_N6];
+};
+
+struct QBVHNodeN
+{
+ float lower[3];
+ int offset;
+ // 16 bytes
+ uchar type;
+ uchar pad;
+ // 18 bytes
+ char exp[3];
+ uchar instMask;
+ // 22 bytes
+ uchar childData[6];
+ // 28 bytes
+ struct QBVH_AABB qbounds; // + 36 bytes
+ // 64 bytes
+};
+
+GRL_INLINE uint QBVHNodeN_blockIncr(struct QBVHNodeN *This, uint childID)
+{
+ return This->childData[childID] & 0x3;
+}
+
+GRL_INLINE uint QBVHNodeN_startPrim(struct QBVHNodeN *This, uint childID)
+{
+ return (This->childData[childID] >> 2) & 0xF;
+}
+
+GRL_INLINE void initQBVHNodeN(struct QBVHNodeN *qnode)
+{
+ uint *ptr = (uint *)qnode;
+ for (uint i = 0; i < 16; i++)
+ ptr[i] = 0;
+}
+
+GRL_INLINE struct AABB extractAABB_QBVHNodeN(struct QBVHNodeN *qnode, uint i)
+{
+ struct AABB aabb;
+ const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f);
+ const int4 lower_i = (int4)(qnode->qbounds.lower_x[i], qnode->qbounds.lower_y[i], qnode->qbounds.lower_z[i], 0);
+ const int4 upper_i = (int4)(qnode->qbounds.upper_x[i], qnode->qbounds.upper_y[i], qnode->qbounds.upper_z[i], 0);
+ const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f);
+ aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
+ aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
+ return aabb;
+}
+
+GRL_INLINE struct AABB getAABB_QBVHNodeN(struct QBVHNodeN *qnode)
+{
+ struct AABB aabb;
+#if 0
+ AABB_init(&aabb);
+ for (uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ struct AABB v = extractAABB_QBVHNodeN(qnode, i);
+ AABB_extend(&aabb, &v);
+ }
+#else
+ uint lower_x = qnode->qbounds.lower_x[0];
+ uint lower_y = qnode->qbounds.lower_y[0];
+ uint lower_z = qnode->qbounds.lower_z[0];
+
+ uint upper_x = qnode->qbounds.upper_x[0];
+ uint upper_y = qnode->qbounds.upper_y[0];
+ uint upper_z = qnode->qbounds.upper_z[0];
+
+ for (uint i = 1; i < BVH_NODE_N6; i++)
+ {
+ uint lx = qnode->qbounds.lower_x[i];
+ uint ly = qnode->qbounds.lower_y[i];
+ uint lz = qnode->qbounds.lower_z[i];
+
+ uint ux = qnode->qbounds.upper_x[i];
+ uint uy = qnode->qbounds.upper_y[i];
+ uint uz = qnode->qbounds.upper_z[i];
+
+ bool valid = lx <= ux;
+ if (valid)
+ {
+ lower_x = min(lower_x, lx);
+ lower_y = min(lower_y, ly);
+ lower_z = min(lower_z, lz);
+
+ upper_x = max(upper_x, ux);
+ upper_y = max(upper_y, uy);
+ upper_z = max(upper_z, uz);
+ }
+ }
+
+ const float4 base = (float4)(qnode->lower[0], qnode->lower[1], qnode->lower[2], 0.0f);
+ const int4 lower_i = (int4)(lower_x, lower_y, lower_z, 0);
+ const int4 upper_i = (int4)(upper_x, upper_y, upper_z, 0);
+ const int4 exp_i = (int4)(qnode->exp[0], qnode->exp[1], qnode->exp[2], 0.0f);
+ aabb.lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
+ aabb.upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
+#endif
+ return aabb;
+}
+
+GRL_INLINE struct AABB3f InternalNode_getAABB3f(struct InternalNode* node)
+{
+ return AABB3fFromAABB(getAABB_QBVHNodeN((struct QBVHNodeN*)node));
+}
+
+GRL_INLINE uint getNumChildren_QBVHNodeN(struct QBVHNodeN *qnode)
+{
+ uint children = 0;
+ for (uint i = 0; i < BVH_NODE_N6; i++)
+ {
+ uint lx = qnode->qbounds.lower_x[i];
+ uint ux = qnode->qbounds.upper_x[i];
+ bool valid = lx <= ux;
+ if (valid)
+ children++;
+ }
+ return children;
+}
+
+GRL_INLINE long extractQBVHNodeN_offset(struct QBVHNodeN *qnode)
+{
+ return ((long)qnode->offset) << 6;
+}
+
+GRL_INLINE void *QBVHNodeN_childrenPointer(struct QBVHNodeN *qnode)
+{
+ const int offset = qnode->offset;
+ return (void *)(qnode + offset);
+}
+
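+// Fill a QBVHNodeN from per-lane child AABBs: the node origin and per-axis exponents are derived
+// from the pre-reduced bounds of all children, then lanes 0..BVH_NODE_N6-1 each quantize and
+// write one child's bounds (degenerated children are collapsed to the node origin).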
+GRL_INLINE void subgroup_setQBVHNodeN_setFields_reduced_bounds(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, struct AABB reduced_aabb)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+ const uint k = subgroupLocalID;
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ struct AABB aabb = reduced_aabb; // needs to execute with full subgroup width
+ aabb = AABB_sub_group_broadcast(&aabb, 0);
+
+ if (subgroupLocalID < BVH_NODE_N6)
+ {
+ struct AABB conservative_aabb = conservativeAABB(&aabb);
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ int3 exp;
+ const float3 mant = frexp_vec3(len, &exp);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->offset = offset;
+ qbvh_node->type = type;
+
+ qbvh_node->lower[0] = org.x;
+ qbvh_node->lower[1] = org.y;
+ qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp[0] = exp.x;
+ qbvh_node->exp[1] = exp.y;
+ qbvh_node->exp[2] = exp.z;
+
+ qbvh_node->instMask = mask;
+
+ uchar3 lower_uchar = (uchar3)(0x80);
+ uchar3 upper_uchar = (uchar3)(0);
+
+ if (subgroupLocalID < numChildren)
+ {
+ struct AABB child_aabb = conservativeAABB(input_aabb);
+
+ float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
+ lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
+ float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
+ upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
+
+ lower_uchar = convert_uchar3_rtn(lower);
+ upper_uchar = convert_uchar3_rtp(upper);
+
+ if (degenerated)
+ {
+ lower_uchar = upper_uchar = 0;
+ }
+ }
+
+ qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
+ qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
+ qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
+ qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
+ qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
+ qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
+
+ qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1;
+
+#if ENABLE_CONVERSION_CHECKS == 1
+
+ if (!(exp.x >= -128 && exp.x <= 127))
+ printf("exp_x error \n");
+ if (!(exp.y >= -128 && exp.y <= 127))
+ printf("exp_y error \n");
+ if (!(exp.z >= -128 && exp.z <= 127))
+ printf("exp_z error \n");
+
+ struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
+ if (!AABB_subset(&child_aabb, &child_qaabb))
+ {
+ uint3 lower_i = convert_uint3(lower_uchar);
+ uint3 upper_i = convert_uint3(upper_uchar);
+
+ printf("\n ERROR %d\n", k);
+ printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
+ printf("%i uncompressed \n", k);
+ AABB_print(&child_aabb);
+ printf("%i compressed \n", k);
+ AABB_print(&child_qaabb);
+
+ printf("%i uncompressed (as int) \n", k);
+ AABB_printasInt(&child_aabb);
+ printf("%i compressed (as int) \n", k);
+ AABB_printasInt(&child_qaabb);
+
+ int4 e0 = child_aabb.lower < child_qaabb.lower;
+ int4 e1 = child_aabb.upper > child_qaabb.upper;
+ printf("e0 %d e1 %d \n", e0, e1);
+ }
+#endif
+ }
+}
+
+GRL_INLINE void subgroup_setQBVHNodeN_setFields(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated)
+{
+ struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb);
+ subgroup_setQBVHNodeN_setFields_reduced_bounds(offset, type, input_aabb, numChildren, mask, qbvh_node, degenerated, aabb);
+}
+
+GRL_INLINE void subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, const uchar mask, struct QBVHNodeN* qbvh_node, const bool degenerated, bool active_lane)
+{
+ const uint lane = get_sub_group_local_id() % 8;
+ const uint node_in_sg = get_sub_group_local_id() / 8;
+ const uint k = lane;
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ struct AABB aabb = AABB_sub_group_reduce_N6(input_aabb); // needs to execute with full subgroup width
+ aabb = AABB_sub_group_shuffle(&aabb, node_in_sg * 8);
+
+ if (lane < BVH_NODE_N6 && active_lane)
+ {
+ struct AABB conservative_aabb = conservativeAABB(&aabb);
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ int3 exp;
+ const float3 mant = frexp_vec3(len, &exp);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->offset = offset;
+ qbvh_node->type = type;
+
+ qbvh_node->lower[0] = org.x;
+ qbvh_node->lower[1] = org.y;
+ qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp[0] = exp.x;
+ qbvh_node->exp[1] = exp.y;
+ qbvh_node->exp[2] = exp.z;
+
+ qbvh_node->instMask = mask;
+
+ uchar3 lower_uchar = (uchar3)(0x80);
+ uchar3 upper_uchar = (uchar3)(0);
+
+ if (lane < numChildren)
+ {
+ struct AABB child_aabb = conservativeAABB(input_aabb);
+
+ float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
+ lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
+ float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
+ upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
+
+ lower_uchar = convert_uchar3_rtn(lower);
+ upper_uchar = convert_uchar3_rtp(upper);
+
+ if (degenerated)
+ {
+ lower_uchar = upper_uchar = 0;
+ }
+ }
+
+ qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
+ qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
+ qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
+ qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
+ qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
+ qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
+
+ qbvh_node->childData[k] = (type == NODE_TYPE_INSTANCE) ? 2 : 1;
+
+#if ENABLE_CONVERSION_CHECKS == 1
+
+ if (!(exp.x >= -128 && exp.x <= 127))
+ printf("exp_x error \n");
+ if (!(exp.y >= -128 && exp.y <= 127))
+ printf("exp_y error \n");
+ if (!(exp.z >= -128 && exp.z <= 127))
+ printf("exp_z error \n");
+
+ struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
+ if (!AABB_subset(&child_aabb, &child_qaabb))
+ {
+ uint3 lower_i = convert_uint3(lower_uchar);
+ uint3 upper_i = convert_uint3(upper_uchar);
+
+ printf("\n ERROR %d\n", k);
+ printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
+ printf("%i uncompressed \n", k);
+ AABB_print(&child_aabb);
+ printf("%i compressed \n", k);
+ AABB_print(&child_qaabb);
+
+ printf("%i uncompressed (as int) \n", k);
+ AABB_printasInt(&child_aabb);
+ printf("%i compressed (as int) \n", k);
+ AABB_printasInt(&child_qaabb);
+
+ int4 e0 = child_aabb.lower < child_qaabb.lower;
+ int4 e1 = child_aabb.upper > child_qaabb.upper;
+ printf("e0 %d e1 %d \n", e0, e1);
+ }
+#endif
+ }
+}
+
+GRL_INLINE void subgroup_setInstanceQBVHNodeN(const int offset, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node, const uint instMask)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ // For a degenerated (or inactive) instance, ignore its box in the exp/origin calculation and make its box a point at the node origin.
+ // If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had this degenerated node been accounted for here.
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
+
+ struct AABB aabb;
+ AABB_init(&aabb);
+
+ // if every child is a degenerated (or inactive) instance, we need to init the aabb with the origin point
+ uchar commonMask = sub_group_reduce_or_N6(instMask);
+ if (subgroupLocalID < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK))
+ aabb = *input_aabb;
+
+ subgroup_setQBVHNodeN_setFields(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated);
+}
+
+
+// return true if is degenerated
+GRL_INLINE bool subgroup_setInstanceBox_2xSIMD8_in_SIMD16(struct AABB* input_aabb, const uint numChildren, uchar* mask, const uint instMask, bool active_lane)
+{
+ const uint lane = get_sub_group_local_id() % 8;
+
+ // For a degenerated (or inactive) instance, ignore its box in the exp/origin calculation and make its box a point at the node origin.
+ // If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had this degenerated node been accounted for here.
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
+
+ // if every child is a degenerated (or inactive) instance, we need to init the aabb with the origin point
+ uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask);
+ if (active_lane)
+ *mask = commonMask;
+
+ if (active_lane && (degenerated && commonMask != BVH_NODE_DEGENERATED_MASK))
+ AABB_init(input_aabb);
+
+ return active_lane ? degenerated : false;
+}
+
+GRL_INLINE void subgroup_setInstanceQBVHNodeN_x2(const int offset, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, const uint instMask, bool active_lane)
+{
+ const uint lane = get_sub_group_local_id() % 8;
+
+ // For a degenerated (or inactive) instance, ignore this box in the exp/origin calculation and make its box a point at the node origin.
+ // If it becomes non-degenerated on update, the tree topology will be equivalent to what it would have been had this node been accounted for here.
+ bool degenerated = (instMask == BVH_NODE_DEGENERATED_MASK);
+
+ struct AABB aabb;
+ AABB_init(&aabb);
+
+ // If every child is a degenerated (or inactive) instance, we need to init the aabb with the origin point.
+ uchar commonMask = sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(instMask);
+ if (lane < numChildren && (!degenerated || commonMask == BVH_NODE_DEGENERATED_MASK))
+ aabb = *input_aabb;
+
+ subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, NODE_TYPE_INSTANCE, &aabb, numChildren, commonMask, qbvh_node, degenerated, active_lane);
+}
+
+
+GRL_INLINE void subgroup_setQBVHNodeN(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, uint mask)
+{
+ const uint subgroupLocalID = get_sub_group_local_id();
+
+ struct AABB aabb;
+ AABB_init(&aabb);
+
+ if (subgroupLocalID < numChildren)
+ aabb = *input_aabb;
+
+ subgroup_setQBVHNodeN_setFields(offset, type, &aabb, numChildren, mask, qbvh_node, false);
+}
+
+
+GRL_INLINE void subgroup_setQBVHNodeN_x2(const int offset, const uint type, struct AABB* input_aabb, const uint numChildren, struct QBVHNodeN* qbvh_node, bool active_lane)
+{
+ const uint lane = get_sub_group_local_id() % 8;
+
+ struct AABB aabb;
+ AABB_init(&aabb);
+
+ if (lane < numChildren)
+ aabb = *input_aabb;
+
+ subgroup_setQBVHNodeN_setFields_2xSIMD8_in_SIMD16(offset, type, &aabb, numChildren, BVH_NODE_DEFAULT_MASK, qbvh_node, false, active_lane);
+}
+
+
+GRL_INLINE void subgroup_QBVHNodeN_setBounds( uniform struct QBVHNodeN* qbvh_node,
+ uniform struct AABB reduced_bounds,
+ varying struct AABB input_aabb,
+ uniform uint numChildren,
+ varying ushort lane )
+{
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ int3 exp;
+
+ struct AABB conservative_aabb = conservativeAABB( &reduced_bounds);
+ const float3 len = AABB_size( &conservative_aabb ).xyz * up;
+ const float3 mant = frexp_vec3( len, &exp );
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > ( float3 )QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->lower[0] = org.x;
+ qbvh_node->lower[1] = org.y;
+ qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp[0] = exp.x;
+ qbvh_node->exp[1] = exp.y;
+ qbvh_node->exp[2] = exp.z;
+
+ qbvh_node->instMask = 0xff;
+
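+ // Default to an empty (inverted) quantized box so lanes without a valid child store lower (0x80) > upper (0).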
+ uchar3 lower_uchar = 0x80;
+ uchar3 upper_uchar = 0;
+
+ if ( lane < BVH_NODE_N6 )
+ {
+ ushort k = lane;
+ if( lane < numChildren )
+ {
+ struct AABB child_aabb = conservativeAABB( &input_aabb ); // conservative ???
+
+ float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) );
+ lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) );
+ float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) );
+ upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) );
+
+ lower_uchar = convert_uchar3_rtn( lower );
+ upper_uchar = convert_uchar3_rtp( upper );
+ }
+
+ qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
+ qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
+ qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
+ qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
+ qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
+ qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
+ }
+
+}
+
+GRL_INLINE void QBVHNodeN_setBounds(struct QBVHNodeN *qbvh_node, struct AABB *input_aabb, const uint numChildren)
+{
+ const float up = 1.0f + ulp;
+ const float down = 1.0f - ulp;
+
+ int3 exp;
+ struct AABB aabb;
+ AABB_init(&aabb);
+ for (uint i = 0; i < numChildren; i++)
+ AABB_extend(&aabb, &input_aabb[i]);
+
+ struct AABB conservative_aabb = conservativeAABB(&aabb);
+ const float3 len = AABB_size(&conservative_aabb).xyz * up;
+ const float3 mant = frexp_vec3(len, &exp);
+ const float3 org = conservative_aabb.lower.xyz;
+
+ exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
+
+ qbvh_node->lower[0] = org.x;
+ qbvh_node->lower[1] = org.y;
+ qbvh_node->lower[2] = org.z;
+
+ qbvh_node->exp[0] = exp.x;
+ qbvh_node->exp[1] = exp.y;
+ qbvh_node->exp[2] = exp.z;
+
+ qbvh_node->instMask = 0xff;
+
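+ // Quantize each child's bounds to 8 bits per axis: coordinates are stored relative to 'org' and
+ // scaled by 2^(8 - exp); lower bounds round down and upper bounds round up so the quantized box
+ // conservatively contains the original child box.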
+ for (uint k = 0; k < numChildren; k++)
+ {
+ struct AABB child_aabb = conservativeAABB(&input_aabb[k]); // conservative ???
+
+ float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
+ lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
+ float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
+ upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
+
+ uchar3 lower_uchar = convert_uchar3_rtn(lower);
+ uchar3 upper_uchar = convert_uchar3_rtp(upper);
+
+ qbvh_node->qbounds.lower_x[k] = lower_uchar.x;
+ qbvh_node->qbounds.lower_y[k] = lower_uchar.y;
+ qbvh_node->qbounds.lower_z[k] = lower_uchar.z;
+ qbvh_node->qbounds.upper_x[k] = upper_uchar.x;
+ qbvh_node->qbounds.upper_y[k] = upper_uchar.y;
+ qbvh_node->qbounds.upper_z[k] = upper_uchar.z;
+
+#if ENABLE_CONVERSION_CHECKS == 1
+ if (!(exp.x >= -128 && exp.x <= 127))
+ printf("exp_x error \n");
+ if (!(exp.y >= -128 && exp.y <= 127))
+ printf("exp_y error \n");
+ if (!(exp.z >= -128 && exp.z <= 127))
+ printf("exp_z error \n");
+
+ struct AABB child_qaabb = extractAABB_QBVHNodeN(qbvh_node, k);
+ if (!AABB_subset(&child_aabb, &child_qaabb))
+ {
+ uint3 lower_i = convert_uint3(lower_uchar);
+ uint3 upper_i = convert_uint3(upper_uchar);
+
+ printf("\n ERROR %d\n", k);
+ printf("lower %f upper %f \n lower_i %d upper_i %d \n", lower, upper, lower_i, upper_i);
+ printf("%i uncompressed \n", k);
+ AABB_print(&child_aabb);
+ printf("%i compressed \n", k);
+ AABB_print(&child_qaabb);
+
+ printf("%i uncompressed (as int) \n", k);
+ AABB_printasInt(&child_aabb);
+ printf("%i compressed (as int) \n", k);
+ AABB_printasInt(&child_qaabb);
+
+ int4 e0 = child_aabb.lower < child_qaabb.lower;
+ int4 e1 = child_aabb.upper > child_qaabb.upper;
+ printf("e0 %d e1 %d \n", e0, e1);
+ }
+#endif
+ }
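+ // Pad the unused child slots with empty (inverted) boxes: quantized lower (0x80) > upper (0).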
+ for (uint k = numChildren; k < BVH_NODE_N6; k++)
+ {
+ qbvh_node->qbounds.lower_x[k] = 0x80;
+ qbvh_node->qbounds.lower_y[k] = 0x80;
+ qbvh_node->qbounds.lower_z[k] = 0x80;
+ qbvh_node->qbounds.upper_x[k] = 0;
+ qbvh_node->qbounds.upper_y[k] = 0;
+ qbvh_node->qbounds.upper_z[k] = 0;
+ }
+}
+
+GRL_INLINE void QBVHNodeN_setChildren(struct QBVHNodeN *qbvh_node, const int offset, const uint numChildren)
+{
+ qbvh_node->offset = offset;
+ for (uint k = 0; k < BVH_NODE_N6; k++)
+ qbvh_node->childData[k] = 1;
+}
+
+GRL_INLINE void QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node)
+{
+ for (uint k = 0; k < BVH_NODE_N6; k++)
+ qbvh_node->childData[k] = 1;
+}
+
+GRL_INLINE void SUBGROUP_QBVHNodeN_setChildIncr1(struct QBVHNodeN *qbvh_node)
+{
+ if( get_sub_group_local_id() < BVH_NODE_N6 )
+ qbvh_node->childData[get_sub_group_local_id()] = 1;
+}
+
+
+GRL_INLINE void QBVHNodeN_setChildIncr2(struct QBVHNodeN *qbvh_node)
+{
+ for (uint k = 0; k < BVH_NODE_N6; k++)
+ qbvh_node->childData[k] = 2;
+}
+
+GRL_INLINE void QBVHNodeN_setType(struct QBVHNodeN *qbvh_node, const uint type)
+{
+ qbvh_node->type = type;
+}
+
+GRL_INLINE void setQBVHNodeN(const int offset, const uint type, struct AABB *input_aabb, const uint numChildren, struct QBVHNodeN *qbvh_node)
+{
+ QBVHNodeN_setType(qbvh_node, type);
+ QBVHNodeN_setChildren(qbvh_node, offset, numChildren);
+ QBVHNodeN_setBounds(qbvh_node, input_aabb, numChildren);
+}
+
+GRL_INLINE void printQBVHNodeN(struct QBVHNodeN *qnode)
+{
+ printf(" offset %d type %d \n", qnode->offset, (int)qnode->type);
+ printf(" lower %f %f %f \n", qnode->lower[0], qnode->lower[1], qnode->lower[2]);
+ printf(" exp %d %d %d \n", (int)qnode->exp[0], (int)qnode->exp[1], (int)qnode->exp[2]);
+ printf(" instMask %d \n", qnode->instMask);
+
+ struct AABB aabb0 = extractAABB_QBVHNodeN(qnode, 0);
+ struct AABB aabb1 = extractAABB_QBVHNodeN(qnode, 1);
+ struct AABB aabb2 = extractAABB_QBVHNodeN(qnode, 2);
+ struct AABB aabb3 = extractAABB_QBVHNodeN(qnode, 3);
+ struct AABB aabb4 = extractAABB_QBVHNodeN(qnode, 4);
+ struct AABB aabb5 = extractAABB_QBVHNodeN(qnode, 5);
+
+ printf(" lower_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_x[0], qnode->qbounds.lower_x[1], qnode->qbounds.lower_x[2], qnode->qbounds.lower_x[3], qnode->qbounds.lower_x[4], qnode->qbounds.lower_x[5], aabb0.lower.x, aabb1.lower.x, aabb2.lower.x, aabb3.lower.x, aabb4.lower.x, aabb5.lower.x);
+ printf(" upper_x %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_x[0], qnode->qbounds.upper_x[1], qnode->qbounds.upper_x[2], qnode->qbounds.upper_x[3], qnode->qbounds.upper_x[4], qnode->qbounds.upper_x[5], aabb0.upper.x, aabb1.upper.x, aabb2.upper.x, aabb3.upper.x, aabb4.upper.x, aabb5.upper.x);
+
+ printf(" lower_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_y[0], qnode->qbounds.lower_y[1], qnode->qbounds.lower_y[2], qnode->qbounds.lower_y[3], qnode->qbounds.lower_y[4], qnode->qbounds.lower_y[5], aabb0.lower.y, aabb1.lower.y, aabb2.lower.y, aabb3.lower.y, aabb4.lower.y, aabb5.lower.y);
+ printf(" upper_y %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_y[0], qnode->qbounds.upper_y[1], qnode->qbounds.upper_y[2], qnode->qbounds.upper_y[3], qnode->qbounds.upper_y[4], qnode->qbounds.upper_y[5], aabb0.upper.y, aabb1.upper.y, aabb2.upper.y, aabb3.upper.y, aabb4.upper.y, aabb5.upper.y);
+
+ printf(" lower_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.lower_z[0], qnode->qbounds.lower_z[1], qnode->qbounds.lower_z[2], qnode->qbounds.lower_z[3], qnode->qbounds.lower_z[4], qnode->qbounds.lower_z[5], aabb0.lower.z, aabb1.lower.z, aabb2.lower.z, aabb3.lower.z, aabb4.lower.z, aabb5.lower.z);
+ printf(" upper_z %d %d %d %d %d %d %f %f %f %f %f %f\n", qnode->qbounds.upper_z[0], qnode->qbounds.upper_z[1], qnode->qbounds.upper_z[2], qnode->qbounds.upper_z[3], qnode->qbounds.upper_z[4], qnode->qbounds.upper_z[5], aabb0.upper.z, aabb1.upper.z, aabb2.upper.z, aabb3.upper.z, aabb4.upper.z, aabb5.upper.z);
+}
+
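+// Computes a child's offset relative to its parent's 64-byte-aligned position within bvh_mem.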
+GRL_INLINE int encodeOffset(global char *bvh_mem, global void *parent, int global_child_offset)
+{
+ long global_parent_offset = (long)parent - (long)bvh_mem;
+ global_parent_offset = global_parent_offset & (~(64 - 1)); // FIXME: (sw) this should not be necessary?
+ int relative_offset = global_child_offset - global_parent_offset; // FIXME: this limits BVH size to 4GB
+ //if ((int)relative_offset <= 0) printf("relative offset <= 0 %d global_child_offset %d global_parent_offset %d \n", relative_offset,global_child_offset,global_parent_offset);
+ return relative_offset;
+}
+
+GRL_INLINE void QBVH6Node_set_offset(struct QBVHNodeN *qnode, void *children)
+{
+ int ofs = (struct QBVHNodeN *)children - qnode;
+ qnode->offset = ofs;
+}
+
+GRL_INLINE void QBVH6Node_set_type(struct QBVHNodeN *qnode, uint type)
+{
+ qnode->type = type;
+}
+
+GRL_INLINE uint sortBVHChildrenIDs(uint input)
+{
+#if BVH_NODE_N == 8
+ return sort8_descending(input);
+#else
+ return sort4_descending(input);
+#endif
+}
+
+enum XFM_BOX_OPTION {
+ XFM_BOX_NO_CLIP = 0,
+ XFM_BOX_NOT_REFINED_CLIPPED = 1, //<< use clipbox; for non-refined nodes, compute the bbox from the children and transform after extending them into one box
+ XFM_BOX_NOT_REFINED_TAKE_CLIPBOX = 2 //<< use clipbox; for non-refined nodes, just transform the clipbox, don't take the children's boxes into account
+};
+
+#define DEB_PRINTFS 0
+#ifndef FINE_TRANSFORM_NODE_BOX
+#define FINE_TRANSFORM_NODE_BOX 0
+#endif
+
+GRL_INLINE struct AABB3f GRL_OVERLOADABLE compute_xfm_bbox(const float* xfm, InternalNode* pnode, enum XFM_BOX_OPTION clipOpt, const AABB3f* clipBox, float matrixTransformOverhead)
+{
+ AABB3f childrenbox;
+#if FINE_TRANSFORM_NODE_BOX
+ struct AffineSpace3f axfm = AffineSpace3f_load_row_major(xfm);
+ bool computeFine = matrixTransformOverhead < 0.6f;
+ computeFine = sub_group_any(computeFine);
+ if (computeFine)
+ {
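+ // Fine path: transform each child AABB individually and union the results; this is tighter
+ // than transforming the union of the children (the fallback path below).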
+ bool clip = clipOpt != XFM_BOX_NO_CLIP;
+ InternalNode node = *pnode;
+
+#if DEB_PRINTFS
+ if (InternalNode_IsChildValid(&node, 5) && !InternalNode_IsChildValid(&node, 4))
+ printf("child 5 valid && child 4 invalid\n");
+ if (InternalNode_IsChildValid(&node, 4) && !InternalNode_IsChildValid(&node, 3))
+ printf("child 4 valid && child 3 invalid\n");
+ if (InternalNode_IsChildValid(&node, 3) && !InternalNode_IsChildValid(&node, 2))
+ printf("child 3 valid && child 2 invalid\n");
+ if (InternalNode_IsChildValid(&node, 2) && !InternalNode_IsChildValid(&node, 1))
+ printf("child 2 valid && child 1 invalid\n");
+ if (InternalNode_IsChildValid(&node, 1) && !InternalNode_IsChildValid(&node, 0))
+ printf("child 1 valid && child 0 invalid\n");
+#endif
+
+#if DEB_PRINTFS
+ printf("F");
+#endif
+ AABB3f child_bounds0 = InternalNode_GetChildAABB(&node, 0);
+ AABB3f child_bounds1 = InternalNode_GetChildAABB(&node, 1);
+ AABB3f child_bounds2 = InternalNode_GetChildAABB(&node, 2);
+ AABB3f child_bounds3 = InternalNode_GetChildAABB(&node, 3);
+ AABB3f child_bounds4 = InternalNode_GetChildAABB(&node, 4);
+ AABB3f child_bounds5 = InternalNode_GetChildAABB(&node, 5);
+
+ // we bravely assume we will have at least 2 children here.
+ if(!InternalNode_IsChildValid(&node, 2)) child_bounds2 = child_bounds0;
+ if(!InternalNode_IsChildValid(&node, 3)) child_bounds3 = child_bounds0;
+ if(!InternalNode_IsChildValid(&node, 4)) child_bounds4 = child_bounds0;
+ if(!InternalNode_IsChildValid(&node, 5)) child_bounds5 = child_bounds0;
+
+ if (clip)
+ {
+ AABB3f_trim_upper(&child_bounds0, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds1, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds2, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds3, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds4, clipBox->upper);
+ AABB3f_trim_upper(&child_bounds5, clipBox->upper);
+ }
+
+ child_bounds0 = transform_aabb(child_bounds0, xfm);
+ child_bounds1 = transform_aabb(child_bounds1, xfm);
+ child_bounds2 = transform_aabb(child_bounds2, xfm);
+ child_bounds3 = transform_aabb(child_bounds3, xfm);
+ child_bounds4 = transform_aabb(child_bounds4, xfm);
+ child_bounds5 = transform_aabb(child_bounds5, xfm);
+
+ AABB3f_extend(&child_bounds0, &child_bounds1);
+ AABB3f_extend(&child_bounds2, &child_bounds3);
+ AABB3f_extend(&child_bounds4, &child_bounds5);
+ AABB3f_extend(&child_bounds0, &child_bounds2);
+ AABB3f_extend(&child_bounds0, &child_bounds4);
+
+ return child_bounds0;
+ }
+#endif
+
+#if DEB_PRINTFS
+ printf("0");
+#endif
+
+ struct AABB3f child_bounds;
+
+ if (clipOpt != XFM_BOX_NOT_REFINED_TAKE_CLIPBOX)
+ {
+ // XFM_BOX_NOT_REFINED_CLIPPED || XFM_BOX_NO_CLIP
+ child_bounds = InternalNode_getAABB3f(pnode);
+ if (clipOpt != XFM_BOX_NO_CLIP)
+ {
+ AABB3f_intersect(&child_bounds, *clipBox);
+ }
+ }
+ else
+ {
+ //XFM_BOX_NOT_REFINED_TAKE_CLIPBOX
+ child_bounds = *clipBox;
+ }
+
+ child_bounds = transform_aabb(child_bounds, xfm);
+ //child_bounds = conservativeAABB3f(&child_bounds);
+ return child_bounds;
+}
+
+GRL_INLINE AABB3f GRL_OVERLOADABLE compute_xfm_bbox(struct AffineSpace3f xfm, InternalNode* pnode, bool clip, AABB3f* clipBox, float matOverhead)
+{
+ float transform[12];
+ load_row_major_from_AffineSpace3f(xfm, transform);
+ return compute_xfm_bbox(transform, pnode, clip, clipBox, matOverhead);
+}
+
+GRL_INLINE uint64_t compute_refit_structs_compacted_size(BVHBase* base)
+{
+ uint dataSize = 0;
+
+ if (BVHBase_HasBackPointers(base))
+ {
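+ // Sub-allocation sizes below are rounded up to 64-byte multiples ((x + 63) & ~63); quad counts are first rounded up to multiples of 256.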
+ const uint fatleafEntrySize = (base->fatLeafCount * sizeof(LeafTableEntry) + 63) & ~63;
+ const uint innerEntrySize = (base->innerCount * sizeof(InnerNodeTableEntry) + 63) & ~63;
+
+ // New atomic update
+ if(base->quadIndicesDataStart > base->backPointerDataStart)
+ {
+ uint numQuads = BVHBase_GetNumQuads(base);
+
+ const uint quadTableMainBufferSize = (numQuads + 255) & ~255;
+ const uint quadLeftoversSize = (base->quadLeftoversCountNewAtomicUpdate + 255) & ~255;
+ const uint quadTableEntriesSize = (((quadTableMainBufferSize + quadLeftoversSize) * sizeof(LeafTableEntry) + 63) & ~63);
+
+ const uint quadIndicesDataSize = (numQuads * sizeof(QuadDataIndices) + 63) & ~63;
+
+ dataSize += quadTableEntriesSize + quadIndicesDataSize;
+ }
+
+ dataSize +=
+ ((BVHBase_GetNumInternalNodes(base) * sizeof(uint) + 63) & ~63)
+ + fatleafEntrySize + innerEntrySize;
+ }
+
+ return (uint64_t)dataSize;
+}
+
+GRL_INLINE uint64_t compute_compacted_size(BVHBase* base)
+{
+ uint64_t size = sizeof(BVHBase);
+ size += BVHBase_GetNumHWInstanceLeaves(base) * sizeof(HwInstanceLeaf);
+ size += BVHBase_GetNumProcedurals(base) * sizeof(ProceduralLeaf);
+ size += BVHBase_GetNumQuads(base) * sizeof(QuadLeaf);
+ size += compute_refit_structs_compacted_size(base);
+ size += BVHBase_GetNumInternalNodes(base) * sizeof(InternalNode);
+ size += sizeof(InstanceDesc) * base->Meta.instanceCount;
+ size += (sizeof(GeoMetaData) * base->Meta.geoCount + 63) & ~63; // align to 64
+ size = (size + 63) & ~63;
+
+ return size;
+}
diff --git a/src/intel/vulkan/grl/gpu/quad.h b/src/intel/vulkan/grl/gpu/quad.h
new file mode 100644
index 00000000000..cc1b7d470f8
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/quad.h
@@ -0,0 +1,127 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "shared.h"
+#include "intrinsics.h"
+#include "AABB.h"
+#include "AABB3f.h"
+
+// JDB TODO: Use corresponding GRL structures!!!
+
+struct Quad
+{
+ unsigned int shaderIndex; // note: also mask
+ unsigned int geomIndex; // note: also geom flags in upper 2 bits
+ unsigned int primIndex0;
+ unsigned int primIndex1Delta;
+ float v[4][3];
+};
+
+GRL_INLINE unsigned int Quad_getGeomIndex(global struct Quad *quad)
+{
+ return quad->geomIndex;
+}
+
+GRL_INLINE unsigned int Quad_getPrimIndex0(global struct Quad *quad)
+{
+ return quad->primIndex0;
+}
+
+GRL_INLINE unsigned int Quad_getPrimIndex1(global struct Quad *quad)
+{
+ return quad->primIndex0 + (quad->primIndex1Delta & 0xFFFF);
+}
+
+GRL_INLINE float3 load_float3(float *p)
+{
+ return (float3)(p[0], p[1], p[2]);
+}
+
+GRL_INLINE float3 load_perm_float3(float *p, const uint3 perm)
+{
+ return (float3)(p[perm.x], p[perm.y], p[perm.z]);
+}
+
+GRL_INLINE float2 load_perm_float2(float *p, const uint2 perm)
+{
+ return (float2)(p[perm.x], p[perm.y]);
+}
+
+GRL_INLINE float load_perm_float(float *p, const uint perm)
+{
+ return p[perm];
+}
+
+GRL_INLINE struct AABB getAABB_Quad(struct Quad *q)
+{
+ struct AABB aabb;
+ const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3])));
+ const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3])));
+ aabb.lower = (float4)(lower, 0.0f);
+ aabb.upper = (float4)(upper, 0.0f);
+ return aabb;
+}
+
+GRL_INLINE void Quad_ExtendAABB(struct Quad* q, struct AABB* box)
+{
+ struct AABB aabb;
+ const float3 lower = min(min(load_float3(q->v[0]), load_float3(q->v[1])), min(load_float3(q->v[2]), load_float3(q->v[3])));
+ const float3 upper = max(max(load_float3(q->v[0]), load_float3(q->v[1])), max(load_float3(q->v[2]), load_float3(q->v[3])));
+ aabb.lower = (float4)(lower, 0.0f);
+ aabb.upper = (float4)(upper, 0.0f);
+ AABB_extend(box, &aabb);
+}
+
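+// Returns lower + upper of the quad's AABB, i.e. twice the centroid (hence the '2' in the name).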
+GRL_INLINE float4 getCentroid2_Quad(struct Quad *q)
+{
+ struct AABB aabb = getAABB_Quad(q);
+ return aabb.lower + aabb.upper;
+}
+
+GRL_INLINE void setQuad(struct Quad *quad, const float4 v0, const float4 v1, const float4 v2, const float4 v3,
+ const uchar j0, const uchar j1, const uchar j2,
+ const uint geomID, const uint primID0, const uint primID1, const uint geomMask, const uint geomFlags )
+{
+ quad->v[0][0] = v0.x;
+ quad->v[0][1] = v0.y;
+ quad->v[0][2] = v0.z;
+ quad->v[1][0] = v1.x;
+ quad->v[1][1] = v1.y;
+ quad->v[1][2] = v1.z;
+ quad->v[2][0] = v2.x;
+ quad->v[2][1] = v2.y;
+ quad->v[2][2] = v2.z;
+ quad->v[3][0] = v3.x;
+ quad->v[3][1] = v3.y;
+ quad->v[3][2] = v3.z;
+
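+ // Bit packing: shaderIndex = geomMask(31:24) | geomID, geomIndex = geomFlags(31:30) | geomID,
+ // primIndex1Delta = singlePrimInLeaf(22) | j2:j1:j0(21:16) | (primID1 - primID0)(15:0).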
+ quad->shaderIndex = (geomMask << 24) | geomID;
+ quad->geomIndex = geomID | (geomFlags << 30);
+ quad->primIndex0 = primID0;
+ const uint delta = primID1 - primID0;
+ const uint j = (((j0) << 0) | ((j1) << 2) | ((j2) << 4));
+ quad->primIndex1Delta = delta | (j << 16) | (1 << 22); // single prim in leaf
+
+}
+
+GRL_INLINE void setQuadVertices(struct Quad *quad, const float3 v0, const float3 v1, const float3 v2, const float3 v3)
+{
+ quad->v[0][0] = v0.x;
+ quad->v[0][1] = v0.y;
+ quad->v[0][2] = v0.z;
+ quad->v[1][0] = v1.x;
+ quad->v[1][1] = v1.y;
+ quad->v[1][2] = v1.z;
+ quad->v[2][0] = v2.x;
+ quad->v[2][1] = v2.y;
+ quad->v[2][2] = v2.z;
+ quad->v[3][0] = v3.x;
+ quad->v[3][1] = v3.y;
+ quad->v[3][2] = v3.z;
+}
diff --git a/src/intel/vulkan/grl/gpu/radix_sort.grl b/src/intel/vulkan/grl/gpu/radix_sort.grl
new file mode 100644
index 00000000000..df932057a10
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/radix_sort.grl
@@ -0,0 +1,163 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module radix_sort;
+
+kernel_module radix_kernels ("morton_radix_sort.cl")
+{
+ links lsc_intrinsics;
+ kernel opencl_build_morton_kernel_sort_bin_items < kernelFunction="sort_morton_codes_bin_items">;
+ kernel opencl_build_morton_kernel_sort_reduce_bins < kernelFunction="sort_morton_codes_reduce_bins">;
+ kernel opencl_build_morton_kernel_sort_scatter_items < kernelFunction="sort_morton_codes_scatter_items">;
+
+ kernel opencl_build_morton_codes_sort_merged < kernelFunction="sort_morton_codes_merged">;
+
+ kernel opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum < kernelFunction="sort_morton_codes_reduce_bins_wide_partial_sum">;
+ kernel opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce < kernelFunction="sort_morton_codes_reduce_bins_wide_add_reduce">;
+}
+
+metakernel sort(
+ qword build_globals,
+ dword shift,
+ qword global_histogram,
+ qword input0,
+ qword input1,
+ dword input0_offset,
+ dword input1_offset,
+ dword iteration,
+ dword threads)
+{
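+ // One radix-sort pass over the morton codes: bin_items histograms the current digit,
+ // reduce_bins combines the per-thread bins, and scatter_items writes the codes to their
+ // sorted positions, with a full flush (wait_idle) between stages.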
+ dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args(
+ build_globals,
+ shift,
+ global_histogram,
+ input0,
+ input1,
+ input0_offset,
+ input1_offset,
+ iteration);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args(
+ threads,
+ global_histogram);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_sort_scatter_items (threads, 1, 1) args(
+ build_globals,
+ shift,
+ global_histogram,
+ input0,
+ input1,
+ input0_offset,
+ input1_offset,
+ iteration);
+
+ control(wait_idle);
+
+}
+
+metakernel sort_bin_items(
+ qword build_globals,
+ qword global_histogram,
+ qword wg_flags,
+ qword input0,
+ dword iteration,
+ dword threads,
+ dword update_wg_flags
+ )
+{
+ dispatch opencl_build_morton_kernel_sort_bin_items (threads, 1, 1) args(
+ build_globals,
+ global_histogram,
+ wg_flags,
+ input0,
+ iteration,
+ threads,
+ update_wg_flags
+ );
+}
+
+metakernel sort_reduce_bins(
+ qword build_globals,
+ qword global_histogram,
+ dword threads,
+ dword iteration)
+{
+ dispatch opencl_build_morton_kernel_sort_reduce_bins (1, 1, 1) args(
+ build_globals,
+ threads,
+ global_histogram,
+ iteration);
+}
+
+metakernel sort_scatter_items(
+ qword build_globals,
+ qword global_histogram,
+ qword input0,
+ qword input1,
+ dword iteration,
+ dword threads,
+ dword update_morton_sort_in_flight )
+{
+ dispatch opencl_build_morton_kernel_sort_scatter_items( threads, 1, 1 ) args(
+ build_globals,
+ global_histogram,
+ input0,
+ input1,
+ iteration,
+ threads,
+ update_morton_sort_in_flight
+ );
+}
+
+metakernel sort_bin_items_merged(
+ qword build_globals,
+ qword global_histogram,
+ qword input0,
+ dword iteration,
+ dword threads)
+{
+ dispatch opencl_build_morton_codes_sort_merged (threads, 1, 1) args(
+ build_globals,
+ global_histogram,
+ input0,
+ iteration,
+ threads
+ );
+}
+
+metakernel sort_reduce_bins_wide(
+ qword build_globals,
+ qword global_histogram,
+ qword global_histogram_tmp,
+ qword wg_flags,
+ dword threads,
+ dword threads_groups,
+ dword iteration)
+{
+ dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_partial_sum(threads_groups, 1, 1) args(
+ build_globals,
+ threads,
+ threads_groups,
+ global_histogram,
+ global_histogram_tmp,
+ wg_flags,
+ iteration);
+
+ control(wait_idle);
+
+ dispatch opencl_build_morton_kernel_sort_reduce_bins_wide_add_reduce(threads_groups, 1, 1) args(
+ build_globals,
+ threads,
+ threads_groups,
+ global_histogram,
+ global_histogram_tmp,
+ iteration);
+}
diff --git a/src/intel/vulkan/grl/gpu/rebraid.grl b/src/intel/vulkan/grl/gpu/rebraid.grl
new file mode 100644
index 00000000000..5aa809637a3
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/rebraid.grl
@@ -0,0 +1,167 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module rebraid;
+
+kernel init_scratch < source="bvh_rebraid.cl", kernelFunction="rebraid_init_scratch" >
+kernel chase_instance_ptrs < source="bvh_rebraid.cl", kernelFunction="rebraid_chase_instance_pointers" >
+kernel calc_aabb < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances" >
+kernel calc_aabb_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_indirect" >
+kernel calc_aabb_ptr < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers" >
+kernel calc_aabb_ptr_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_computeAABB_DXR_instances_pointers_indirect" >
+kernel count_splits < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits" >
+kernel count_splits_SG < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG" >
+kernel count_splits_SG_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_count_splits_SG_indirect" >
+kernel build_primrefs < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs" >
+kernel build_primrefs_indirect < source="bvh_rebraid.cl", kernelFunction="rebraid_build_primrefs_indirect" >
+
+//kernel ISA_TEST < source="bvh_rebraid.cl", kernelFunction="ISA_TEST" >
+//kernel DEBUG_PRINT < source="bvh_rebraid.cl", kernelFunction="DEBUG_PRINT" >
+
+
+const PRIMREF_GROUP_SIZE = 256;
+
+const COUNT_SPLITS_GROUP_SIZE = 16;
+
+struct MKRebraidArgs
+{
+ qword bvh_buffer;
+ qword primref_buffer;
+ qword global_buffer;
+ qword instances_buffer;
+ qword rebraid_scratch;
+ qword flat_instances_buffer;
+ dword num_instances;
+ dword num_extra_primrefs;
+};
+
+metakernel rebraid(
+ MKRebraidArgs Args
+ )
+{
+ dispatch init_scratch(1,1,1) args( Args.rebraid_scratch );
+ dispatch calc_aabb(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer );
+ control( wait_idle );
+
+ //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE);
+ //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.num_instances );
+
+ dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch );
+ control( wait_idle );
+
+ define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE);
+
+ dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
+ control( wait_idle );
+
+ //dispatch DEBUG_PRINT(1,1,1) args( Args.global_buffer, Args.instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
+}
+
+metakernel rebraid_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo)
+{
+
+ dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch);
+
+ define num_groups REG0;
+ num_groups = load_dword(indirectBuildRangeInfo);
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect calc_aabb_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo);
+ control(wait_idle);
+
+ dispatch_indirect count_splits_SG_indirect
+ args(Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo);
+
+ define groupsize_1 REG1; // groupsize - 1
+ define C_8 REG2;
+
+ groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1
+ C_8 = 8; // log_2(PRIMREF_GROUP_SIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE;
+ DISPATCHDIM_X = num_groups.lo;
+
+ control(wait_idle);
+
+ dispatch_indirect build_primrefs_indirect args(
+ Args.global_buffer,
+ Args.bvh_buffer,
+ Args.instances_buffer,
+ Args.rebraid_scratch,
+ Args.primref_buffer,
+ indirectBuildRangeInfo,
+ Args.num_extra_primrefs);
+ control(wait_idle);
+}
+
+metakernel rebraid_ptrs(
+ MKRebraidArgs Args
+ )
+{
+ dispatch init_scratch(1,1,1) args( Args.rebraid_scratch );
+ dispatch chase_instance_ptrs( Args.num_instances, 1, 1) args( Args.instances_buffer, Args.flat_instances_buffer );
+ dispatch calc_aabb_ptr(Args.num_instances,1,1) args( Args.bvh_buffer, Args.instances_buffer );
+ control( wait_idle );
+
+ //define num_count_groups ((Args.num_instances + (COUNT_SPLITS_GROUP_SIZE-1)) / COUNT_SPLITS_GROUP_SIZE);
+ //dispatch count_splits(num_count_groups,1,1) args( Args.bvh_buffer, Args.instances_buffer, Args.rebraid_scratch );
+
+ dispatch count_splits_SG(Args.num_instances,1,1) args( Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch );
+ control( wait_idle );
+
+ define num_primref_groups ((Args.num_instances + (PRIMREF_GROUP_SIZE-1)) / PRIMREF_GROUP_SIZE);
+
+
+ dispatch build_primrefs(num_primref_groups,1,1) args( Args.global_buffer, Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, Args.primref_buffer, Args.num_extra_primrefs, Args.num_instances );
+ control( wait_idle );
+
+}
+
+metakernel rebraid_ptrs_indirect(MKRebraidArgs Args, qword indirectBuildRangeInfo)
+{
+ dispatch init_scratch(1, 1, 1) args(Args.rebraid_scratch);
+
+ define num_groups REG0;
+ num_groups = load_dword(indirectBuildRangeInfo);
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect chase_instance_ptrs
+ args(Args.instances_buffer, Args.flat_instances_buffer, indirectBuildRangeInfo);
+ dispatch_indirect calc_aabb_ptr_indirect args(Args.bvh_buffer, Args.instances_buffer, indirectBuildRangeInfo);
+ control(wait_idle);
+
+ dispatch_indirect count_splits_SG_indirect
+ args(Args.bvh_buffer, Args.flat_instances_buffer, Args.rebraid_scratch, indirectBuildRangeInfo);
+
+ define groupsize_1 REG1; // groupsize - 1
+ define C_8 REG2;
+
+ groupsize_1 = 255; // PRIMREF_GROUP_SIZE - 1
+ C_8 = 8; // log_2(PRIMREF_GROUP_SIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_8; // num_groups / PRIMREF_GROUP_SIZE;
+ DISPATCHDIM_X = num_groups.lo;
+
+ control(wait_idle);
+
+ dispatch_indirect build_primrefs_indirect args(
+ Args.global_buffer,
+ Args.bvh_buffer,
+ Args.flat_instances_buffer,
+ Args.rebraid_scratch,
+ Args.primref_buffer,
+ Args.num_extra_primrefs,
+ indirectBuildRangeInfo,
+ Args.num_instances);
+ control(wait_idle);
+}
diff --git a/src/intel/vulkan/grl/gpu/shared.h b/src/intel/vulkan/grl/gpu/shared.h
new file mode 100644
index 00000000000..0d42d98a1d4
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/shared.h
@@ -0,0 +1,182 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "GRLGen12.h"
+#pragma once
+
+#define sizeof_Quad 64
+#define sizeof_Procedural 64
+#define sizeof_PrimRef 32
+#define sizeof_PresplitItem 8
+#define sizeof_HwInstanceLeaf 128
+#define MORTON_BUILDER_SUBTREE_THRESHOLD 256
+#define MORTON_BUILDER_P2_ELEMENTS_IN_SLM 16 * 1024 / 32
+// Temporarily disable localized phase2 due to issues in ELG pre-silicon.
+// This implementation will be replaced with a bottom-up + bounding-box approach that removes the need for the phase2 refit.
+#define MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD /*100000*/ 0
+
+#define BVH_QUAD_NODE 4
+#define BVH_INSTANCE_NODE 1
+#define BVH_INTERNAL_NODE 0
+#define BVH_PROCEDURAL_NODE 3
+#define BUILDRECORD_STACK_SIZE 48
+#define BINS 16
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(GPUBVHBuilder)
+
+struct AABB
+{
+ float4 lower;
+ float4 upper;
+};
+
+typedef struct BlockAllocator
+{
+ unsigned int start;
+ unsigned int cur;
+} BlockAllocator;
+
+struct Globals
+{
+ struct AABB centroidBounds;
+
+ unsigned int build_record_start;
+ unsigned int numPrimitives;
+ unsigned int leafPrimType;
+ unsigned int leafSize;
+
+ unsigned int numSplittedPrimitives;
+ unsigned int numBuildRecords;
+
+ // spatial split state
+ unsigned int numOriginalPrimitives;
+ float presplitPrioritySum;
+ float probThreshold;
+
+ // binned-sah bfs state
+ unsigned int counter;
+ unsigned int numBuildRecords_extended;
+
+ // sync variable used for global-sync on work groups
+ unsigned int sync;
+
+
+ /* morton code builder state */
+ unsigned int shift; // used by adaptive mc-builder
+ unsigned int shift_mask; // used by adaptive mc-builder
+ unsigned int binary_hierarchy_root;
+ unsigned int p0_allocated_num;
+ unsigned int p0_created_num;
+ unsigned int morton_sort_in_flight;
+ unsigned int sort_iterations;
+
+ gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
+};
+
+struct Range
+{
+ unsigned int start, end;
+};
+
+struct Triangle
+{
+ unsigned int vtx[3];
+ //unsigned int primID;
+ //unsigned int geomID;
+};
+
+struct MortonCodePrimitive
+{
+ uint64_t index_code; // 64bit code + index combo
+};
+
+struct BuildRecord
+{
+ struct AABB centroidBounds;
+ unsigned int start, end;
+ __global void *current;
+};
+
+struct BinaryMortonCodeHierarchy
+{
+ struct Range range;
+ unsigned int leftChild;
+ unsigned int rightChild;
+ // unsigned int flag;
+};
+
+typedef struct MortonFlattenedBoxlessNode {
+ uint binary_hierarchy_index; // only needed when type != BVH_INTERNAL_NODE
+ uint childOffset_type; // childOffset : 26, type : 6
+ uint backPointer; // same usage as in bvh
+} MortonFlattenedBoxlessNode;
+
+struct StatStackEntry
+{
+ struct AABB aabb;
+ unsigned int node;
+ unsigned int type;
+ unsigned int depth;
+ float area;
+};
+
+struct BuildRecordMorton
+{
+ unsigned int nodeID;
+ unsigned int items;
+ unsigned int current_index;
+ unsigned int parent_index;
+};
+
+struct Split
+{
+ float sah;
+ int dim;
+ int pos;
+};
+
+struct BinMapping
+{
+ float4 ofs, scale;
+};
+
+struct BinInfo
+{
+ struct AABB3f boundsX[BINS];
+ struct AABB3f boundsY[BINS];
+ struct AABB3f boundsZ[BINS];
+ uint3 counts[BINS];
+};
+
+struct BinInfo2
+{
+ struct AABB3f boundsX[BINS * 2];
+ struct AABB3f boundsY[BINS * 2];
+ struct AABB3f boundsZ[BINS * 2];
+ uint3 counts[BINS * 2];
+};
+
+struct GlobalBuildRecord
+{
+ struct BinInfo2 binInfo;
+ struct BinMapping binMapping;
+ struct Split split;
+ struct Range range;
+ struct AABB leftCentroid;
+ struct AABB rightCentroid;
+ struct AABB leftGeometry;
+ struct AABB rightGeometry;
+ unsigned int atomicCountLeft;
+ unsigned int atomicCountRight;
+ unsigned int buildRecordID;
+};
+
+GRL_NAMESPACE_END(GPUBVHBuilder)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/gpu/structs.grl b/src/intel/vulkan/grl/gpu/structs.grl
new file mode 100644
index 00000000000..f15b1d2346b
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/structs.grl
@@ -0,0 +1,38 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module structs;
+
+struct MKBuilderState {
+ qword geomDesc_buffer;
+ qword build_primref_buffer;
+ qword build_globals;
+ qword bvh_buffer;
+ dword leaf_type;
+ dword leaf_size;
+};
+
+struct MKSizeEstimate {
+ dword numTriangles;
+ dword numProcedurals;
+ dword numPrimitives;
+ dword numMeshes;
+ dword numBuildPrimitives;
+ dword numPrimitivesToSplit;
+ dword instance_descs_start;
+ dword geo_meta_data_start;
+ dword node_data_start;
+ dword leaf_data_start;
+ dword procedural_data_start;
+ dword back_pointer_start;
+ dword sizeTotal;
+ dword updateScratchSizeTotal;
+ dword fatleaf_table_start;
+ dword innernode_table_start;
+ dword max_fatleaves;
+ dword quad_indices_data_start;
+};
diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.cl b/src/intel/vulkan/grl/gpu/traversal_shader.cl
new file mode 100644
index 00000000000..ee5d2afcc75
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/traversal_shader.cl
@@ -0,0 +1,277 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#include "instance.h"
+#include "api_interface.h"
+
+#include "bvh_build_primref.h"
+#include "bvh_build_refit.h"
+
+/*
+ Create primrefs from array of instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+TS_primrefs_from_instances(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ uint numInstances,
+ global struct AABB* primrefs,
+ global uchar* pAABBs,
+ global uchar* pIsProcedural,
+ dword aabb_stride,
+ uint allowUpdate
+ )
+{
+ const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < numInstances)
+ {
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
+
+ global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
+ if ( pIsProcedural[instanceIndex] )
+ {
+ procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
+ }
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ procedural_bb,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel
+TS_primrefs_from_instances_indirect(
+ global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instances,
+ uint numInstances,
+ global struct AABB* primrefs,
+ global uchar* pAABBs,
+ global uchar* pIsProcedural,
+ dword aabb_stride,
+ uint allowUpdate,
+ global struct IndirectBuildRangeInfo* indirect_data
+ )
+{
+ const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < indirect_data->primitiveCount)
+ {
+ instances = (global __const struct GRL_RAYTRACING_INSTANCE_DESC*)
+ (((global char*)instances) + indirect_data->primitiveOffset);
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances + instanceIndex;
+
+ global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
+ if ( pIsProcedural[instanceIndex] )
+ {
+ procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
+ }
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ procedural_bb,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of pointers to instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
+TS_primrefs_from_instances_pointers(global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global void* instances_in,
+ uint numInstances,
+ global struct AABB* primrefs,
+ global uchar* pAABBs,
+ global uchar* pIsProcedural,
+ dword aabb_stride,
+ uint allowUpdate
+ )
+{
+ global const struct GRL_RAYTRACING_INSTANCE_DESC** instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in;
+
+ const uint instanceIndex = get_sub_group_local_id() + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < numInstances)
+ {
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
+
+ global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
+ if (pIsProcedural[instanceIndex])
+ {
+ procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
+ }
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ procedural_bb,
+ allowUpdate);
+ }
+}
+
+/*
+ Create primrefs from array of pointers to instance descriptors.
+ */
+ GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
+void kernel
+TS_primrefs_from_instances_pointers_indirect(global struct Globals* globals,
+ global struct BVHBase* bvh,
+ global void* instances_in,
+ global struct AABB* primrefs,
+ global uchar* pAABBs,
+ global uchar* pIsProcedural,
+ dword aabb_stride,
+ uint allowUpdate,
+ global struct IndirectBuildRangeInfo* indirect_data
+ )
+{
+ const uint instanceIndex = get_local_id(0) + get_group_id(0) * MAX_HW_SIMD_WIDTH;
+ if (instanceIndex < indirect_data->primitiveCount)
+ {
+ instances_in = ((global char*)instances_in) + indirect_data->primitiveOffset;
+ global const struct GRL_RAYTRACING_INSTANCE_DESC** instances =
+ (global const struct GRL_RAYTRACING_INSTANCE_DESC**)instances_in;
+ global __const struct GRL_RAYTRACING_INSTANCE_DESC* instance = instances[instanceIndex];
+
+ global struct GRL_RAYTRACING_AABB* procedural_bb = 0;
+ if (pIsProcedural[instanceIndex])
+ {
+ procedural_bb = (global struct GRL_RAYTRACING_AABB*)(pAABBs + aabb_stride * instanceIndex);
+ }
+
+ primrefs_from_instances(
+ globals,
+ bvh,
+ instance,
+ instanceIndex,
+ primrefs,
+ procedural_bb,
+ allowUpdate);
+ }
+}
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+TS_update_instance_leaves(global struct BVHBase* bvh,
+ uint64_t dxrInstancesArray,
+ uint64_t dxrInstancesPtr,
+ global struct AABB3f* instance_aabb_scratch,
+ global uchar* aabbs,
+ global uchar* is_procedural,
+ dword aabb_stride
+)
+{
+ uint num_leaves = BVHBase_GetNumHWInstanceLeaves(bvh);
+ uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
+ if (id >= num_leaves)
+ return;
+
+ struct HwInstanceLeaf* leaves = BVHBase_GetHWInstanceLeaves(bvh);
+ uint idx = HwInstanceLeaf_GetInstanceIndex(&leaves[id]);
+
+ global GRL_RAYTRACING_AABB* procedural_box = 0;
+ if (is_procedural[idx])
+ {
+ procedural_box = (global GRL_RAYTRACING_AABB*)(aabbs + (aabb_stride * idx));
+ }
+
+ DO_update_instance_leaves(
+ bvh,
+ dxrInstancesArray,
+ dxrInstancesPtr,
+ instance_aabb_scratch,
+ id,
+ procedural_box);
+}
+
+
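+// Post-build fixup for traversal-shader instance nodes: each valid child of an instance node is
+// re-typed as either a HW instance or a procedural leaf based on its primref, and the node itself
+// is re-typed as a regular internal node.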
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(16, 1, 1)))
+void kernel
+TS_fixup_leaves( global struct BVHBase* bvh,
+ global uchar* primref_index,
+ global PrimRef* primrefs,
+ uint stride )
+
+{
+ uint num_inners = BVHBase_GetNumInternalNodes(bvh);
+ uint id = get_local_id(0) + get_local_size(0) * get_group_id(0);
+
+ // assign 8 lanes to each inner node, 6 of which will do useful work
+ uint node_id = id / 8;
+ uint child_id = id % 8;
+
+ bool node_valid = (node_id < num_inners);
+
+ if (node_valid )
+ {
+ global InternalNode* nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh);
+ global InternalNode* my_node = nodes + node_id;
+
+ if (my_node->nodeType == BVH_INSTANCE_NODE)
+ {
+ bool child_valid = (child_id < 6) && InternalNode_IsChildValid(my_node, child_id);
+ if (child_valid)
+ {
+ global HwInstanceLeaf* leaves = (global HwInstanceLeaf*)InternalNode_GetChildren(my_node);
+ uint leafIndex = (leaves - BVHBase_GetHWInstanceLeaves(bvh)) + child_id;
+
+ const uint primrefID = *(uint*)(primref_index + leafIndex * stride);
+
+ uint type = PRIMREF_isProceduralInstance(&primrefs[primrefID]) ?
+ BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
+
+ InternalNode_SetChildType(my_node, child_id, type);
+ }
+
+ if (child_id == 0)
+ my_node->nodeType = BVH_INTERNAL_NODE;
+ }
+ }
+}
+
+
+
+
+
+GRL_ANNOTATE_IGC_DO_NOT_SPILL
+__attribute__((reqd_work_group_size(SG_REFIT_WG_SIZE, 1, 1))) void kernel
+TS_Refit_per_one_startpoint_sg(
+ global struct BVHBase* bvh,
+ global struct AABB3f* instance_leaf_aabbs,
+ global uchar* procedural_instance_enable_buffer )
+{
+ DO_Refit_per_one_startpoint_sg(bvh, (global GRL_RAYTRACING_GEOMETRY_DESC*) bvh, instance_leaf_aabbs, procedural_instance_enable_buffer );
+
+}
diff --git a/src/intel/vulkan/grl/gpu/traversal_shader.grl b/src/intel/vulkan/grl/gpu/traversal_shader.grl
new file mode 100644
index 00000000000..3820996c348
--- /dev/null
+++ b/src/intel/vulkan/grl/gpu/traversal_shader.grl
@@ -0,0 +1,244 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+module traversal_shader;
+
+kernel_module morton_kernels ("traversal_shader.cl")
+{
+ links lsc_intrinsics;
+
+ kernel TS_primrefs_from_instances < kernelFunction = "TS_primrefs_from_instances" >;
+ kernel TS_primrefs_from_instances_indirect < kernelFunction = "TS_primrefs_from_instances_indirect" >;
+ kernel TS_primrefs_from_instances_ptrs < kernelFunction = "TS_primrefs_from_instances_pointers" >;
+ kernel TS_primrefs_from_instances_ptrs_indirect < kernelFunction = "TS_primrefs_from_instances_pointers_indirect" >;
+ kernel TS_update_instance_leaves < kernelFunction = "TS_update_instance_leaves" >;
+ kernel TS_Refit_per_one_startpoint_sg < kernelFunction = "TS_Refit_per_one_startpoint_sg" >;
+ kernel TS_fixup_leaves < kernelFunction = "TS_fixup_leaves" >;
+}
+
+struct MKTSBuildArgs
+{
+ qword build_globals;
+ qword bvh_buffer;
+ qword instance_descs;
+ qword build_primref_buffer;
+ qword aabb_buffer;
+ qword is_procedural_buffer;
+ qword leaf_creation_index_buffer;
+ dword aabb_stride;
+ dword num_instances;
+ dword leaf_creation_index_stride;
+};
+
+const BUILD_PRIMREFS_GROUPSIZE = 16;
+
+
+metakernel TS_build_primrefs( MKTSBuildArgs build_state, dword allowUpdate )
+{
+ define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE);
+ dispatch TS_primrefs_from_instances(num_groups, 1, 1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.instance_descs,
+ build_state.num_instances,
+ build_state.build_primref_buffer,
+ build_state.aabb_buffer,
+ build_state.is_procedural_buffer,
+ build_state.aabb_stride,
+ allowUpdate
+ );
+
+}
+
+metakernel TS_build_primrefs_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1
+ C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect TS_primrefs_from_instances_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.instance_descs,
+ build_state.build_primref_buffer,
+ build_state.aabb_buffer,
+ build_state.is_procedural_buffer,
+ build_state.aabb_stride,
+ allowUpdate,
+ indirectBuildRangeInfo
+ );
+
+}
+
+metakernel TS_build_primrefs_array_of_pointers( MKTSBuildArgs build_state, dword allowUpdate )
+{
+ define num_groups((build_state.num_instances + BUILD_PRIMREFS_GROUPSIZE - 1) / BUILD_PRIMREFS_GROUPSIZE);
+ dispatch TS_primrefs_from_instances_ptrs(num_groups, 1, 1) args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.instance_descs,
+ build_state.num_instances,
+ build_state.build_primref_buffer,
+ build_state.aabb_buffer,
+ build_state.is_procedural_buffer,
+ build_state.aabb_stride,
+ allowUpdate
+ );
+}
+
+metakernel
+TS_build_primrefs_array_of_pointers_indirect(MKTSBuildArgs build_state, qword indirectBuildRangeInfo, dword allowUpdate)
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // BUILD_PRIMREFS_GROUPSIZE - 1
+ C_4 = 4; // log_2(BUILD_PRIMREFS_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / BUILD_PRIMREFS_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect TS_primrefs_from_instances_ptrs_indirect args(
+ build_state.build_globals,
+ build_state.bvh_buffer,
+ build_state.instance_descs,
+ build_state.build_primref_buffer,
+ build_state.aabb_buffer,
+ build_state.is_procedural_buffer,
+ build_state.aabb_stride,
+ allowUpdate,
+ indirectBuildRangeInfo
+ );
+}
+
+
+
+
+const UPDATE_INSTANCE_LEAVES_GROUPSIZE = 16;
+
+struct MKTSUpdateArgs
+{
+ qword bvh_buffer;
+ qword instance_descs;
+ qword instance_descs_ptrs;
+ qword aabb_buffer;
+ qword is_procedural_buffer;
+ qword refit_scratch;
+ dword aabb_stride;
+ dword num_instances;
+};
+
+metakernel TS_update_instance_leaves( MKTSUpdateArgs update_state )
+{
+ define num_groups((update_state.num_instances + UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1) / UPDATE_INSTANCE_LEAVES_GROUPSIZE);
+ dispatch TS_update_instance_leaves(num_groups, 1, 1) args(
+ update_state.bvh_buffer,
+ update_state.instance_descs,
+ update_state.instance_descs_ptrs,
+ update_state.refit_scratch,
+ update_state.aabb_buffer,
+ update_state.is_procedural_buffer,
+ update_state.aabb_stride
+ );
+}
+
+metakernel TS_update_instance_leaves_indirect( MKTSUpdateArgs update_state, qword indirectBuildRangeInfo )
+{
+ define num_groups REG0;
+ define groupsize_1 REG1; // groupsize - 1
+ define C_4 REG2;
+
+ // init with primitiveCount
+ num_groups = load_dword(indirectBuildRangeInfo);
+ groupsize_1 = 15; // UPDATE_INSTANCE_LEAVES_GROUPSIZE - 1
+ C_4 = 4; // log_2(UPDATE_INSTANCE_LEAVES_GROUPSIZE)
+
+ num_groups = num_groups + groupsize_1;
+ num_groups = num_groups >> C_4; // num_groups / UPDATE_INSTANCE_LEAVES_GROUPSIZE;
+
+ DISPATCHDIM_X = num_groups.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ // need to add indirect offset?
+ dispatch_indirect TS_update_instance_leaves args(
+ update_state.bvh_buffer,
+ update_state.instance_descs,
+ update_state.instance_descs_ptrs,
+ update_state.refit_scratch,
+ update_state.aabb_buffer,
+ update_state.is_procedural_buffer,
+ update_state.aabb_stride
+ );
+}
+
+metakernel TS_refit(MKTSUpdateArgs update_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end )
+{
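+    // DISPATCHDIM_X is computed on the command streamer as
+    // (*bvh_inner_nodes_end - bvh_inner_nodes_start_value), so the host does not need to know the
+    // inner-node count up front.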
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect TS_Refit_per_one_startpoint_sg
+ args(
+ update_state.bvh_buffer,
+ update_state.refit_scratch,
+ update_state.is_procedural_buffer
+ );
+}
+
+
+const FIXUP_LEAVES_NODES_PER_GROUP = 2;
+
+metakernel TS_fixup_leaves(MKTSBuildArgs build_state, qword bvh_inner_nodes_start_value, qword bvh_inner_nodes_end )
+{
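+    // Each group fixes up FIXUP_LEAVES_NODES_PER_GROUP (2) inner nodes, so DISPATCHDIM_X is
+    // computed as ceil((*bvh_inner_nodes_end - bvh_inner_nodes_start_value) / 2) using
+    // command streamer registers.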
+ define ONE REG3;
+
+ ONE = 1;
+ REG0 = bvh_inner_nodes_start_value;
+ REG1.lo = load_dword(bvh_inner_nodes_end);
+ REG1.hi = 0;
+ REG2 = REG1 - REG0;
+ REG2 = REG2 + ONE;
+ REG2 = REG2 >> ONE;
+
+ DISPATCHDIM_X = REG2.lo;
+ DISPATCHDIM_Y = 1;
+ DISPATCHDIM_Z = 1;
+
+ dispatch_indirect TS_fixup_leaves
+ args(
+ build_state.bvh_buffer,
+ build_state.leaf_creation_index_buffer,
+ build_state.build_primref_buffer,
+ build_state.leaf_creation_index_stride
+ );
+
+}
diff --git a/src/intel/vulkan/grl/grl_cl_kernel_gen.py b/src/intel/vulkan/grl/grl_cl_kernel_gen.py
new file mode 100644
index 00000000000..148438e9fa6
--- /dev/null
+++ b/src/intel/vulkan/grl/grl_cl_kernel_gen.py
@@ -0,0 +1,226 @@
+COPYRIGHT = """\
+/*
+ * Copyright 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+"""
+
+import argparse
+import os
+
+from grl_parser import parse_grl_file
+from mako.template import Template
+
+TEMPLATE_H = Template(COPYRIGHT + """
+/* This file generated from ${filename}, don't edit directly. */
+
+#ifndef GRL_CL_KERNEL_H
+#define GRL_CL_KERNEL_H
+
+#include "genxml/gen_macros.h"
+#include "compiler/brw_kernel.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum grl_cl_kernel {
+% for k in kernels:
+ GRL_CL_KERNEL_${k.upper()},
+% endfor
+ GRL_CL_KERNEL_MAX,
+};
+
+const char *genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel);
+
+const char *genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id);
+
+void genX(grl_get_cl_kernel)(struct brw_kernel *kernel, enum grl_cl_kernel id);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* GRL_CL_KERNEL_H */
+""")
+
+TEMPLATE_C = Template(COPYRIGHT + """
+/* This file generated from ${filename}, don't edit directly. */
+
+#include "grl_cl_kernel.h"
+
+% for k in kernels:
+#include "${prefix}_${k}.h"
+% endfor
+
+const char *
+genX(grl_cl_kernel_name)(enum grl_cl_kernel kernel)
+{
+ switch (kernel) {
+% for k in kernels:
+ case GRL_CL_KERNEL_${k.upper()}: return "${k}";
+% endfor
+ default: return "unknown";
+ }
+}
+
+const char *
+genX(grl_get_cl_kernel_sha1)(enum grl_cl_kernel id)
+{
+ switch (id) {
+% for k in kernels:
+ case GRL_CL_KERNEL_${k.upper()}: return ${prefix}_${k}_sha1;
+% endfor
+ default:
+ unreachable("Invalid GRL kernel enum");
+ }
+};
+
+void
+${prefix}_grl_get_cl_kernel(struct brw_kernel *kernel, enum grl_cl_kernel id)
+{
+ switch (id) {
+% for k in kernels:
+ case GRL_CL_KERNEL_${k.upper()}:
+ *kernel = ${prefix}_${k};
+ break;
+% endfor
+ default:
+ unreachable("Invalid GRL kernel enum");
+ }
+}
+""")
+
+def get_libraries_files(kernel_module):
+ lib_files = []
+ for item in kernel_module[3]:
+ if item[0] != 'library':
+ continue
+ default_file = None
+ fallback_file = None
+ path_directory = None
+ for props in item[2]:
+ if props[0] == 'fallback':
+ fallback_file = props[1]
+ elif props[0] == 'default':
+ default_file = props[1]
+ elif props[0] == 'path':
+ path_directory = props[1]
+ assert path_directory
+ assert default_file or fallback_file
+ if fallback_file:
+ lib_files.append(os.path.join(path_directory, fallback_file))
+ else:
+ lib_files.append(os.path.join(path_directory, default_file))
+ return lib_files
+
+def add_kernels(kernels, cl_file, entrypoint, libs):
+ assert cl_file.endswith('.cl')
+ for lib_file in libs:
+ assert lib_file.endswith('.cl')
+ kernels.append((cl_file, entrypoint, ','.join(libs)))
+
+def get_kernels(grl_nodes):
+ kernels = []
+ for item in grl_nodes:
+ assert isinstance(item, tuple)
+ if item[0] == 'kernel':
+ ann = item[2]
+ add_kernels(kernels, ann['source'], ann['kernelFunction'], [])
+ elif item[0] == 'kernel-module':
+ cl_file = item[2]
+ libfiles = get_libraries_files(item)
+ for kernel_def in item[3]:
+ if kernel_def[0] == 'kernel':
+ ann = kernel_def[2]
+ add_kernels(kernels, cl_file, ann['kernelFunction'], libfiles)
+ return kernels
+
+def parse_libraries(filenames):
+ libraries = {}
+ for fname in filenames:
+ lib_package = parse_grl_file(fname, [])
+ for lib in lib_package:
+ assert lib[0] == 'library'
+ # Add the directory of the library so that CL files can be found.
+ lib[2].append(('path', os.path.dirname(fname)))
+ libraries[lib[1]] = lib
+ return libraries
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--out-c', help='Output C file')
+ parser.add_argument('--out-h', help='Output H file')
+ parser.add_argument('--ls-kernels', action='store_const', const=True,
+ help='List all openCL kernels')
+ parser.add_argument('--prefix', help='Prefix')
+ parser.add_argument('--library', dest='libraries', action='append',
+ default=[], help='Libraries to include')
+ parser.add_argument('files', type=str, nargs='*', help='GRL files')
+ args = parser.parse_args()
+
+ libraries = parse_libraries(args.libraries)
+
+ kernels = []
+ for fname in args.files:
+ kernels += get_kernels(parse_grl_file(fname, libraries))
+
+ # Make the list of kernels unique and sorted
+ kernels = sorted(list(set(kernels)))
+
+ if args.ls_kernels:
+ for cl_file, entrypoint, libs in kernels:
+ if not os.path.isabs(cl_file):
+ cl_file = os.path.join(os.path.dirname(fname), cl_file)
+ print('{}:{}:{}'.format(cl_file, entrypoint, libs))
+
+ kernel_c_names = []
+ for cl_file, entrypoint, libs in kernels:
+ cl_file = os.path.splitext(cl_file)[0]
+ cl_file_name = cl_file.replace('/', '_')
+ kernel_c_names.append('_'.join([cl_file_name, entrypoint]))
+
+ try:
+ if args.out_h:
+ with open(args.out_h, 'w', encoding='utf-8') as f:
+ f.write(TEMPLATE_H.render(kernels=kernel_c_names,
+ filename=os.path.basename(__file__)))
+
+ if args.out_c:
+ with open(args.out_c, 'w', encoding='utf-8') as f:
+ f.write(TEMPLATE_C.render(kernels=kernel_c_names,
+ prefix=args.prefix,
+ filename=os.path.basename(__file__)))
+ except Exception:
+ # If an error occurs and Python is run in debug mode (__debug__), import
+ # some helpers from mako to print a useful stack trace to stderr and exit
+ # with status 1; otherwise just re-raise the exception.
+ if __debug__:
+ import sys
+ from mako import exceptions
+ sys.stderr.write(exceptions.text_error_template().render() + '\n')
+ sys.exit(1)
+ raise
+
+if __name__ == '__main__':
+ main()
diff --git a/src/intel/vulkan/grl/grl_metakernel_gen.py b/src/intel/vulkan/grl/grl_metakernel_gen.py
new file mode 100644
index 00000000000..6c416bd3d5d
--- /dev/null
+++ b/src/intel/vulkan/grl/grl_metakernel_gen.py
@@ -0,0 +1,933 @@
+#!/bin/env python
+COPYRIGHT = """\
+/*
+ * Copyright 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+"""
+
+import argparse
+import os.path
+import re
+import sys
+
+from grl_parser import parse_grl_file
+
+class Writer(object):
+ def __init__(self, file):
+ self._file = file
+ self._indent = 0
+ self._new_line = True
+
+ def push_indent(self, levels=4):
+ self._indent += levels
+
+ def pop_indent(self, levels=4):
+ self._indent -= levels
+
+ def write(self, s, *fmt):
+ if self._new_line:
+ s = '\n' + s
+ self._new_line = False
+ if s.endswith('\n'):
+ self._new_line = True
+ s = s[:-1]
+ if fmt:
+ s = s.format(*fmt)
+ self._file.write(s.replace('\n', '\n' + ' ' * self._indent))
+
+# Internal Representation
+
+class Value(object):
+ def __init__(self, name=None, zone=None):
+ self.name = name
+ self._zone = zone
+ self.live = False
+
+ @property
+ def zone(self):
+ assert self._zone is not None
+ return self._zone
+
+ def is_reg(self):
+ return False
+
+ def c_val(self):
+ if not self.name:
+ print(self)
+ assert self.name
+ return self.name
+
+ def c_cpu_val(self):
+ assert self.zone == 'cpu'
+ return self.c_val()
+
+ def c_gpu_val(self):
+ if self.zone == 'gpu':
+ return self.c_val()
+ else:
+ return 'mi_imm({})'.format(self.c_cpu_val())
+
+class Constant(Value):
+ def __init__(self, value):
+ super().__init__(zone='cpu')
+ self.value = value
+
+ def c_val(self):
+ if self.value < 100:
+ return str(self.value)
+ elif self.value < (1 << 32):
+ return '0x{:x}u'.format(self.value)
+ else:
+ return '0x{:x}ull'.format(self.value)
+
+class Register(Value):
+ def __init__(self, name):
+ super().__init__(name=name, zone='gpu')
+
+ def is_reg(self):
+ return True
+
+class FixedGPR(Register):
+ def __init__(self, num):
+ super().__init__('REG{}'.format(num))
+ self.num = num
+
+ def write_c(self, w):
+ w.write('UNUSED struct mi_value {} = mi_reserve_gpr(&b, {});\n',
+ self.name, self.num)
+
+class GroupSizeRegister(Register):
+ def __init__(self, comp):
+ super().__init__('DISPATCHDIM_' + 'XYZ'[comp])
+ self.comp = comp
+
+class Member(Value):
+ def __init__(self, value, member):
+ super().__init__(zone=value.zone)
+ self.value = value
+ self.member = member
+
+ def is_reg(self):
+ return self.value.is_reg()
+
+ def c_val(self):
+ c_val = self.value.c_val()
+ if self.zone == 'gpu':
+ assert isinstance(self.value, Register)
+ if self.member == 'hi':
+ return 'mi_value_half({}, true)'.format(c_val)
+ elif self.member == 'lo':
+ return 'mi_value_half({}, false)'.format(c_val)
+ else:
+ assert False, 'Invalid member: {}'.format(self.member)
+ else:
+ return '.'.join([c_val, self.member])
+
+class OffsetOf(Value):
+ def __init__(self, mk, expr):
+ super().__init__(zone='cpu')
+ assert isinstance(expr, tuple) and expr[0] == 'member'
+ self.type = mk.m.get_type(expr[1])
+ self.field = expr[2]
+
+ def c_val(self):
+ return 'offsetof({}, {})'.format(self.type.c_name, self.field)
+
+class Scope(object):
+ def __init__(self, m, mk, parent):
+ self.m = m
+ self.mk = mk
+ self.parent = parent
+ self.defs = {}
+
+ def add_def(self, d, name=None):
+ if name is None:
+ name = d.name
+ assert name not in self.defs
+ self.defs[name] = d
+
+ def get_def(self, name):
+ if name in self.defs:
+ return self.defs[name]
+ assert self.parent, 'Unknown definition: "{}"'.format(name)
+ return self.parent.get_def(name)
+
+class Statement(object):
+ def __init__(self, srcs=[]):
+ assert isinstance(srcs, (list, tuple))
+ self.srcs = list(srcs)
+
+class SSAStatement(Statement, Value):
+ _count = 0
+
+ def __init__(self, zone, srcs):
+ Statement.__init__(self, srcs)
+ Value.__init__(self, None, zone)
+ self.c_name = '_tmp{}'.format(SSAStatement._count)
+ SSAStatement._count += 1
+
+ def c_val(self):
+ return self.c_name
+
+ def write_c_refs(self, w):
+ assert self.zone == 'gpu'
+ assert self.uses > 0
+ if self.uses > 1:
+ w.write('mi_value_add_refs(&b, {}, {});\n',
+ self.c_name, self.uses - 1)
+
+class Half(SSAStatement):
+ def __init__(self, value, half):
+ assert half in ('hi', 'lo')
+ super().__init__(None, [value])
+ self.half = half
+
+ @property
+ def zone(self):
+ return self.srcs[0].zone
+
+ def write_c(self, w):
+ assert self.half in ('hi', 'lo')
+ if self.zone == 'cpu':
+ if self.half == 'hi':
+ w.write('uint32_t {} = (uint64_t)({}) >> 32;\n',
+ self.c_name, self.srcs[0].c_cpu_val())
+ else:
+ w.write('uint32_t {} = {};\n',
+ self.c_name, self.srcs[0].c_cpu_val())
+ else:
+ if self.half == 'hi':
+ w.write('struct mi_value {} = mi_value_half({}, true);\n',
+ self.c_name, self.srcs[0].c_gpu_val())
+ else:
+ w.write('struct mi_value {} = mi_value_half({}, false);\n',
+ self.c_name, self.srcs[0].c_gpu_val())
+ self.write_c_refs(w)
+
+class Expression(SSAStatement):
+ def __init__(self, mk, op, *srcs):
+ super().__init__(None, srcs)
+ self.op = op
+
+ @property
+ def zone(self):
+ zone = 'cpu'
+ for s in self.srcs:
+ if s.zone == 'gpu':
+ zone = 'gpu'
+ return zone
+
+ def write_c(self, w):
+ if self.zone == 'cpu':
+ w.write('uint64_t {} = ', self.c_name)
+ c_cpu_vals = [s.c_cpu_val() for s in self.srcs]
+ if len(self.srcs) == 1:
+ w.write('({} {})', self.op, c_cpu_vals[0])
+ elif len(self.srcs) == 2:
+ w.write('({} {} {})', c_cpu_vals[0], self.op, c_cpu_vals[1])
+ else:
+ assert len(self.srcs) == 3 and self.op == '?'
+ w.write('({} ? {} : {})', *c_cpu_vals)
+ w.write(';\n')
+ return
+
+ w.write('struct mi_value {} = ', self.c_name)
+ if self.op == '~':
+ w.write('mi_inot(&b, {});\n', self.srcs[0].c_gpu_val())
+ elif self.op == '+':
+ w.write('mi_iadd(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '-':
+ w.write('mi_isub(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '&':
+ w.write('mi_iand(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '|':
+ w.write('mi_ior(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '<<':
+ if self.srcs[1].zone == 'cpu':
+ w.write('mi_ishl_imm(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val())
+ else:
+ w.write('mi_ishl(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '>>':
+ if self.srcs[1].zone == 'cpu':
+ w.write('mi_ushr_imm(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_cpu_val())
+ else:
+ w.write('mi_ushr(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '==':
+ w.write('mi_ieq(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '<':
+ w.write('mi_ult(&b, {}, {});\n',
+ self.srcs[0].c_gpu_val(), self.srcs[1].c_gpu_val())
+ elif self.op == '>':
+ w.write('mi_ult(&b, {}, {});\n',
+ self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val())
+ elif self.op == '<=':
+ w.write('mi_uge(&b, {}, {});\n',
+ self.srcs[1].c_gpu_val(), self.srcs[0].c_gpu_val())
+ else:
+ assert False, 'Unknown expression opcode: {}'.format(self.op)
+ self.write_c_refs(w)
+
+class StoreReg(Statement):
+ def __init__(self, mk, reg, value):
+ super().__init__([mk.load_value(value)])
+ self.reg = mk.parse_value(reg)
+ assert self.reg.is_reg()
+
+ def write_c(self, w):
+ value = self.srcs[0]
+ w.write('mi_store(&b, {}, {});\n',
+ self.reg.c_gpu_val(), value.c_gpu_val())
+
+class LoadMem(SSAStatement):
+ def __init__(self, mk, bit_size, addr):
+ super().__init__('gpu', [mk.load_value(addr)])
+ self.bit_size = bit_size
+
+ def write_c(self, w):
+ addr = self.srcs[0]
+ w.write('struct mi_value {} = ', self.c_name)
+ if addr.zone == 'cpu':
+ w.write('mi_mem{}(anv_address_from_u64({}));\n',
+ self.bit_size, addr.c_cpu_val())
+ else:
+ assert self.bit_size == 64
+ w.write('mi_load_mem64_offset(&b, anv_address_from_u64(0), {});\n',
+ addr.c_gpu_val())
+ self.write_c_refs(w)
+
+class StoreMem(Statement):
+ def __init__(self, mk, bit_size, addr, src):
+ super().__init__([mk.load_value(addr), mk.load_value(src)])
+ self.bit_size = bit_size
+
+ def write_c(self, w):
+ addr, data = tuple(self.srcs)
+ if addr.zone == 'cpu':
+ w.write('mi_store(&b, mi_mem{}(anv_address_from_u64({})), {});\n',
+ self.bit_size, addr.c_cpu_val(), data.c_gpu_val())
+ else:
+ assert self.bit_size == 64
+ w.write('mi_store_mem64_offset(&b, anv_address_from_u64(0), {}, {});\n',
+ addr.c_gpu_val(), data.c_gpu_val())
+
+class GoTo(Statement):
+ def __init__(self, mk, target_id, cond=None, invert=False):
+ cond = [mk.load_value(cond)] if cond is not None else []
+ super().__init__(cond)
+ self.target_id = target_id
+ self.invert = invert
+ self.mk = mk
+
+ def write_c(self, w):
+ # Now that we've parsed the entire metakernel, we can look up the
+ # actual target from the id
+ target = self.mk.get_goto_target(self.target_id)
+
+ if self.srcs:
+ cond = self.srcs[0]
+ if self.invert:
+ w.write('mi_goto_if(&b, mi_inot(&b, {}), &{});\n', cond.c_gpu_val(), target.c_name)
+ else:
+ w.write('mi_goto_if(&b, {}, &{});\n', cond.c_gpu_val(), target.c_name)
+ else:
+ w.write('mi_goto(&b, &{});\n', target.c_name)
+
+class GoToTarget(Statement):
+ def __init__(self, mk, name):
+ super().__init__()
+ self.name = name
+ self.c_name = '_goto_target_' + name
+ self.goto_tokens = []
+
+ mk.add_goto_target(self)
+
+ def write_decl(self, w):
+ w.write('struct mi_goto_target {} = MI_GOTO_TARGET_INIT;\n',
+ self.c_name)
+
+ def write_c(self, w):
+ w.write('mi_goto_target(&b, &{});\n', self.c_name)
+
+class Dispatch(Statement):
+ def __init__(self, mk, kernel, group_size, args, postsync):
+ if group_size is None:
+ srcs = [mk.scope.get_def('DISPATCHDIM_{}'.format(d)) for d in 'XYZ']
+ else:
+ srcs = [mk.load_value(s) for s in group_size]
+ srcs += [mk.load_value(a) for a in args]
+ super().__init__(srcs)
+ self.kernel = mk.m.kernels[kernel]
+ self.indirect = group_size is None
+ self.postsync = postsync
+
+ def write_c(self, w):
+ w.write('{\n')
+ w.push_indent()
+
+ group_size = self.srcs[:3]
+ args = self.srcs[3:]
+ if not self.indirect:
+ w.write('const uint32_t _group_size[3] = {{ {}, {}, {} }};\n',
+ *[s.c_cpu_val() for s in group_size])
+ gs = '_group_size'
+ else:
+ gs = 'NULL'
+
+ w.write('const struct anv_kernel_arg _args[] = {\n')
+ w.push_indent()
+ for arg in args:
+ w.write('{{ .u64 = {} }},\n', arg.c_cpu_val())
+ w.pop_indent()
+ w.write('};\n')
+
+ w.write('genX(grl_dispatch)(cmd_buffer, {},\n', self.kernel.c_name)
+ w.write(' {}, ARRAY_SIZE(_args), _args);\n', gs)
+ w.pop_indent()
+ w.write('}\n')
+
+class SemWait(Statement):
+ def __init__(self, scope, wait):
+ super().__init__()
+ self.wait = wait
+
+class Control(Statement):
+ def __init__(self, scope, wait):
+ super().__init__()
+ self.wait = wait
+
+ def write_c(self, w):
+ w.write('cmd_buffer->state.pending_pipe_bits |=\n')
+ w.write(' ANV_PIPE_CS_STALL_BIT |\n')
+ w.write(' ANV_PIPE_DATA_CACHE_FLUSH_BIT |\n')
+ w.write(' ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT;\n')
+ w.write('genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);\n')
+
+TYPE_REMAPS = {
+ 'dword' : 'uint32_t',
+ 'qword' : 'uint64_t',
+}
+
+class Module(object):
+ def __init__(self, grl_dir, elems):
+ assert isinstance(elems[0], tuple)
+ assert elems[0][0] == 'module-name'
+ self.grl_dir = grl_dir
+ self.name = elems[0][1]
+ self.kernels = {}
+ self.structs = {}
+ self.constants = []
+ self.metakernels = []
+ self.regs = {}
+
+ scope = Scope(self, None, None)
+ for e in elems[1:]:
+ if e[0] == 'kernel':
+ k = Kernel(self, *e[1:])
+ assert k.name not in self.kernels
+ self.kernels[k.name] = k
+ elif e[0] == 'kernel-module':
+ m = KernelModule(self, *e[1:])
+ for k in m.kernels:
+ assert k.name not in self.kernels
+ self.kernels[k.name] = k
+ elif e[0] == 'struct':
+ s = Struct(self, *e[1:])
+ assert s.name not in self.structs
+ self.structs[s.name] = s
+ elif e[0] == 'named-constant':
+ c = NamedConstant(*e[1:])
+ scope.add_def(c)
+ self.constants.append(c)
+ elif e[0] == 'meta-kernel':
+ mk = MetaKernel(self, scope, *e[1:])
+ self.metakernels.append(mk)
+ elif e[0] == 'import':
+ assert e[2] == 'struct'
+ self.import_struct(e[1], e[3])
+ else:
+ assert False, 'Invalid module-level token: {}'.format(e[0])
+
+ def import_struct(self, filename, struct_name):
+ elems = parse_grl_file(os.path.join(self.grl_dir, filename), [])
+ assert elems
+ for e in elems[1:]:
+ if e[0] == 'struct' and e[1] == struct_name:
+ s = Struct(self, *e[1:])
+ assert s.name not in self.structs
+ self.structs[s.name] = s
+ return
+ assert False, "Struct {0} not found in {1}".format(struct_name, filename)
+
+ def get_type(self, name):
+ if name in self.structs:
+ return self.structs[name]
+ return BasicType(TYPE_REMAPS.get(name, name))
+
+ def get_fixed_gpr(self, num):
+ assert isinstance(num, int)
+ if num in self.regs:
+ return self.regs[num]
+
+ reg = FixedGPR(num)
+ self.regs[num] = reg
+ return reg
+
+ def optimize(self):
+ progress = True
+ while progress:
+ progress = False
+
+ # Copy Propagation
+ for mk in self.metakernels:
+ if mk.opt_copy_prop():
+ progress = True
+
+ # Dead Code Elimination
+ for r in self.regs.values():
+ r.live = False
+ for c in self.constants:
+ c.live = False
+ for mk in self.metakernels:
+ mk.opt_dead_code1()
+ for mk in self.metakernels:
+ if mk.opt_dead_code2():
+ progress = True
+ for n in list(self.regs.keys()):
+ if not self.regs[n].live:
+ del self.regs[n]
+ progress = True
+ self.constants = [c for c in self.constants if c.live]
+
+ def compact_regs(self):
+ old_regs = self.regs
+ self.regs = {}
+ for i, reg in enumerate(old_regs.values()):
+ reg.num = i
+ self.regs[i] = reg
+
+ def write_h(self, w):
+ for s in self.structs.values():
+ s.write_h(w)
+ for mk in self.metakernels:
+ mk.write_h(w)
+
+ def write_c(self, w):
+ for c in self.constants:
+ c.write_c(w)
+ for mk in self.metakernels:
+ mk.write_c(w)
+
+class Kernel(object):
+ def __init__(self, m, name, ann):
+ self.name = name
+ self.source_file = ann['source']
+ self.kernel_name = self.source_file.replace('/', '_')[:-3].upper()
+ self.entrypoint = ann['kernelFunction']
+
+ assert self.source_file.endswith('.cl')
+ self.c_name = '_'.join([
+ 'GRL_CL_KERNEL',
+ self.kernel_name,
+ self.entrypoint.upper(),
+ ])
+
+class KernelModule(object):
+ def __init__(self, m, name, source, kernels):
+ self.name = name
+ self.kernels = []
+ self.libraries = []
+
+ for k in kernels:
+ if k[0] == 'kernel':
+ k[2]['source'] = source
+ self.kernels.append(Kernel(m, *k[1:]))
+ elif k[0] == 'library':
+ # Skip this for now.
+ pass
+
+class BasicType(object):
+ def __init__(self, name):
+ self.name = name
+ self.c_name = name
+
+class Struct(object):
+ def __init__(self, m, name, fields, align):
+ assert align == 0
+ self.name = name
+ self.c_name = 'struct ' + '_'.join(['grl', m.name, self.name])
+ self.fields = [(m.get_type(t), n) for t, n in fields]
+
+ def write_h(self, w):
+ w.write('{} {{\n', self.c_name)
+ w.push_indent()
+ for f in self.fields:
+ w.write('{} {};\n', f[0].c_name, f[1])
+ w.pop_indent()
+ w.write('};\n')
+
+class NamedConstant(Value):
+ def __init__(self, name, value):
+ super().__init__(name, 'cpu')
+ self.name = name
+ self.value = Constant(value)
+ self.written = False
+
+ def set_module(self, m):
+ pass
+
+ def write_c(self, w):
+ if self.written:
+ return
+ w.write('static const uint64_t {} = {};\n',
+ self.name, self.value.c_val())
+ self.written = True
+
+class MetaKernelParameter(Value):
+ def __init__(self, mk, type, name):
+ super().__init__(name, 'cpu')
+ self.type = mk.m.get_type(type)
+
+class MetaKernel(object):
+ def __init__(self, m, m_scope, name, params, ann, statements):
+ self.m = m
+ self.name = name
+ self.c_name = '_'.join(['grl', m.name, self.name])
+ self.goto_targets = {}
+ self.num_tmps = 0
+
+ mk_scope = Scope(m, self, m_scope)
+
+ self.params = [MetaKernelParameter(self, *p) for p in params]
+ for p in self.params:
+ mk_scope.add_def(p)
+
+ mk_scope.add_def(GroupSizeRegister(0), name='DISPATCHDIM_X')
+ mk_scope.add_def(GroupSizeRegister(1), name='DISPATCHDIM_Y')
+ mk_scope.add_def(GroupSizeRegister(2), name='DISPATCHDIM_Z')
+
+ self.statements = []
+ self.parse_stmt(mk_scope, statements)
+ self.scope = None
+
+ def get_tmp(self):
+ tmpN = '_tmp{}'.format(self.num_tmps)
+ self.num_tmps += 1
+ return tmpN
+
+ def add_stmt(self, stmt):
+ self.statements.append(stmt)
+ return stmt
+
+ def parse_value(self, v):
+ if isinstance(v, Value):
+ return v
+ elif isinstance(v, str):
+ if re.match(r'REG\d+', v):
+ return self.m.get_fixed_gpr(int(v[3:]))
+ else:
+ return self.scope.get_def(v)
+ elif isinstance(v, int):
+ return Constant(v)
+ elif isinstance(v, tuple):
+ if v[0] == 'member':
+ return Member(self.parse_value(v[1]), v[2])
+ elif v[0] == 'offsetof':
+ return OffsetOf(self, v[1])
+ else:
+ op = v[0]
+ srcs = [self.parse_value(s) for s in v[1:]]
+ return self.add_stmt(Expression(self, op, *srcs))
+ else:
+ assert False, 'Invalid value: {}'.format(v)
+
+ def load_value(self, v):
+ v = self.parse_value(v)
+ if isinstance(v, Member) and v.zone == 'gpu':
+ v = self.add_stmt(Half(v.value, v.member))
+ return v
+
+ def parse_stmt(self, scope, s):
+ self.scope = scope
+ if isinstance(s, list):
+ subscope = Scope(self.m, self, scope)
+ for stmt in s:
+ self.parse_stmt(subscope, stmt)
+ elif s[0] == 'define':
+ scope.add_def(self.parse_value(s[2]), name=s[1])
+ elif s[0] == 'assign':
+ self.add_stmt(StoreReg(self, *s[1:]))
+ elif s[0] == 'dispatch':
+ self.add_stmt(Dispatch(self, *s[1:]))
+ elif s[0] == 'load-dword':
+ v = self.add_stmt(LoadMem(self, 32, s[2]))
+ self.add_stmt(StoreReg(self, s[1], v))
+ elif s[0] == 'load-qword':
+ v = self.add_stmt(LoadMem(self, 64, s[2]))
+ self.add_stmt(StoreReg(self, s[1], v))
+ elif s[0] == 'store-dword':
+ self.add_stmt(StoreMem(self, 32, *s[1:]))
+ elif s[0] == 'store-qword':
+ self.add_stmt(StoreMem(self, 64, *s[1:]))
+ elif s[0] == 'goto':
+ self.add_stmt(GoTo(self, s[1]))
+ elif s[0] == 'goto-if':
+ self.add_stmt(GoTo(self, s[1], s[2]))
+ elif s[0] == 'goto-if-not':
+ self.add_stmt(GoTo(self, s[1], s[2], invert=True))
+ elif s[0] == 'label':
+ self.add_stmt(GoToTarget(self, s[1]))
+ elif s[0] == 'control':
+ self.add_stmt(Control(self, s[1]))
+ elif s[0] == 'sem-wait-while':
+ self.add_stmt(Control(self, s[1]))
+ else:
+ assert False, 'Invalid statement: {}'.format(s[0])
+
+ def add_goto_target(self, t):
+ assert t.name not in self.goto_targets
+ self.goto_targets[t.name] = t
+
+ def get_goto_target(self, name):
+ return self.goto_targets[name]
+
+ def opt_copy_prop(self):
+ progress = False
+ copies = {}
+ for stmt in self.statements:
+ for i in range(len(stmt.srcs)):
+ src = stmt.srcs[i]
+ if isinstance(src, FixedGPR) and src.num in copies:
+ stmt.srcs[i] = copies[src.num]
+ progress = True
+
+ if isinstance(stmt, StoreReg):
+ reg = stmt.reg
+ if isinstance(reg, Member):
+ reg = reg.value
+
+ if isinstance(reg, FixedGPR):
+ copies.pop(reg.num, None)
+ if not stmt.srcs[0].is_reg():
+ copies[reg.num] = stmt.srcs[0]
+ elif isinstance(stmt, (GoTo, GoToTarget)):
+ copies = {}
+
+ return progress
+
+ def opt_dead_code1(self):
+ for stmt in self.statements:
+ # Mark every register which is read as live
+ for src in stmt.srcs:
+ if isinstance(src, Register):
+ src.live = True
+
+ # Initialize every SSA statement to dead
+ if isinstance(stmt, SSAStatement):
+ stmt.live = False
+
+ def opt_dead_code2(self):
+ def yield_live(statements):
+ gprs_read = set(self.m.regs.keys())
+ for stmt in statements:
+ if isinstance(stmt, SSAStatement):
+ if not stmt.live:
+ continue
+ elif isinstance(stmt, StoreReg):
+ reg = stmt.reg
+ if isinstance(reg, Member):
+ reg = reg.value
+
+ if not stmt.reg.live:
+ continue
+
+ if isinstance(reg, FixedGPR):
+ if reg.num in gprs_read:
+ gprs_read.remove(reg.num)
+ else:
+ continue
+ elif isinstance(stmt, (GoTo, GoToTarget)):
+ gprs_read = set(self.m.regs.keys())
+
+ for src in stmt.srcs:
+ src.live = True
+ if isinstance(src, FixedGPR):
+ gprs_read.add(src.num)
+ yield stmt
+
+ old_stmt_list = self.statements
+ old_stmt_list.reverse()
+ self.statements = list(yield_live(old_stmt_list))
+ self.statements.reverse()
+ return len(self.statements) != len(old_stmt_list)
+
+ def count_ssa_value_uses(self):
+ for stmt in self.statements:
+ if isinstance(stmt, SSAStatement):
+ stmt.uses = 0
+
+ for src in stmt.srcs:
+ if isinstance(src, SSAStatement):
+ src.uses += 1
+
+ def write_h(self, w):
+ w.write('void\n')
+ w.write('genX({})(\n', self.c_name)
+ w.push_indent()
+ w.write('struct anv_cmd_buffer *cmd_buffer')
+ for p in self.params:
+ w.write(',\n{} {}', p.type.c_name, p.name)
+ w.write(');\n')
+ w.pop_indent()
+
+ def write_c(self, w):
+ w.write('void\n')
+ w.write('genX({})(\n', self.c_name)
+ w.push_indent()
+ w.write('struct anv_cmd_buffer *cmd_buffer')
+ for p in self.params:
+ w.write(',\n{} {}', p.type.c_name, p.name)
+ w.write(')\n')
+ w.pop_indent()
+ w.write('{\n')
+ w.push_indent()
+
+ w.write('struct mi_builder b;\n')
+ w.write('mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);\n')
+ w.write('/* TODO: use anv_mocs? */\n')
+ w.write('const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);\n')
+ w.write('mi_builder_set_mocs(&b, mocs);\n')
+ w.write('\n')
+
+ for r in self.m.regs.values():
+ r.write_c(w)
+ w.write('\n')
+
+ for t in self.goto_targets.values():
+ t.write_decl(w)
+ w.write('\n')
+
+ self.count_ssa_value_uses()
+ for s in self.statements:
+ s.write_c(w)
+
+ w.pop_indent()
+
+ w.write('}\n')
+
+HEADER_PROLOGUE = COPYRIGHT + '''
+#include "anv_private.h"
+#include "grl/genX_grl.h"
+
+#ifndef {0}
+#define {0}
+
+#ifdef __cplusplus
+extern "C" {{
+#endif
+
+'''
+
+HEADER_EPILOGUE = '''
+#ifdef __cplusplus
+}}
+#endif
+
+#endif /* {0} */
+'''
+
+C_PROLOGUE = COPYRIGHT + '''
+#include "{0}"
+
+#include "genxml/gen_macros.h"
+#include "genxml/genX_pack.h"
+#include "genxml/genX_rt_pack.h"
+
+/* We reserve :
+ * - GPR 14 for secondary command buffer returns
+ * - GPR 15 for conditional rendering
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 14
+#define __gen_get_batch_dwords anv_batch_emit_dwords
+#define __gen_address_offset anv_address_add
+#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
+#include "common/mi_builder.h"
+
+#define MI_PREDICATE_RESULT mi_reg32(0x2418)
+#define DISPATCHDIM_X mi_reg32(0x2500)
+#define DISPATCHDIM_Y mi_reg32(0x2504)
+#define DISPATCHDIM_Z mi_reg32(0x2508)
+'''
+
+def parse_libraries(filenames):
+ libraries = {}
+ for fname in filenames:
+ lib_package = parse_grl_file(fname, [])
+ for lib in lib_package:
+ assert lib[0] == 'library'
+ # Add the directory of the library so that CL files can be found.
+ lib[2].append(('path', os.path.dirname(fname)))
+ libraries[lib[1]] = lib
+ return libraries
+
+def main():
+ argparser = argparse.ArgumentParser()
+ argparser.add_argument('--out-c', help='Output C file')
+ argparser.add_argument('--out-h', help='Output H file')
+ argparser.add_argument('--library', dest='libraries', action='append',
+ default=[], help='Libraries to include')
+ argparser.add_argument('grl', help="Input file")
+ args = argparser.parse_args()
+
+ grl_dir = os.path.dirname(args.grl)
+
+ libraries = parse_libraries(args.libraries)
+
+ ir = parse_grl_file(args.grl, libraries)
+
+ m = Module(grl_dir, ir)
+ m.optimize()
+ m.compact_regs()
+
+ with open(args.out_h, 'w') as f:
+ guard = os.path.splitext(os.path.basename(args.out_h))[0].upper()
+ w = Writer(f)
+ w.write(HEADER_PROLOGUE, guard)
+ m.write_h(w)
+ w.write(HEADER_EPILOGUE, guard)
+
+ with open(args.out_c, 'w') as f:
+ w = Writer(f)
+ w.write(C_PROLOGUE, os.path.basename(args.out_h))
+ m.write_c(w)
+
+if __name__ == '__main__':
+ main()
diff --git a/src/intel/vulkan/grl/grl_parser.py b/src/intel/vulkan/grl/grl_parser.py
new file mode 100644
index 00000000000..2d62b25a169
--- /dev/null
+++ b/src/intel/vulkan/grl/grl_parser.py
@@ -0,0 +1,586 @@
+#!/bin/env python
+COPYRIGHT = """\
+/*
+ * Copyright 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+"""
+
+import os
+import re
+import ply.lex as lex
+import ply.yacc as yacc
+
+# Libraries
+
+libraries = {}
+
+# LEXER
+
+keywords = {
+ '__debugbreak': 'KW_DEBUGBREAK',
+ 'alignas': 'KW_ALIGNAS',
+ 'args': 'KW_ARGS',
+ 'atomic': 'KW_ATOMIC',
+ 'atomic_return': 'KW_ATOMIC_RETURN',
+ 'const': 'KW_CONST',
+ 'control': 'KW_CONTROL',
+ 'define': 'KW_DEFINE',
+ 'dispatch': 'KW_DISPATCH',
+ 'dispatch_indirect': 'KW_DISPATCH_INDIRECT',
+ 'goto': 'KW_GOTO',
+ 'if': 'KW_IF',
+ 'kernel': 'KW_KERNEL',
+ 'kernel_module': 'KW_KERNEL_MODULE',
+ 'import': 'KW_IMPORT',
+ 'library': 'KW_LIBRARY',
+ 'links': 'KW_LINKS',
+ 'load_dword': 'KW_LOAD_DWORD',
+ 'load_qword': 'KW_LOAD_QWORD',
+ 'metakernel': 'KW_METAKERNEL',
+ 'module': 'KW_MODULE',
+ 'not': 'KW_NOT',
+ 'offsetof': 'KW_OFFSETOF',
+ 'postsync': 'KW_POSTSYNC',
+ 'print': 'KW_PRINT',
+ 'semaphore_wait': 'KW_SEMAPHORE_WAIT',
+ 'shiftof': 'KW_SHIFTOF',
+ 'sizeof': 'KW_SIZEOF',
+ 'store_dword': 'KW_STORE_DWORD',
+ 'store_qword': 'KW_STORE_QWORD',
+ 'store_timestamp': 'KW_STORE_TIMESTAMP',
+ 'struct': 'KW_STRUCT',
+ 'unsigned': 'KW_UNSIGNED',
+ 'while': 'KW_WHILE'
+}
+
+ops = {
+ '&&': 'OP_LOGICAL_AND',
+ '||': 'OP_LOGICAL_OR',
+ '==': 'OP_EQUALEQUAL',
+ '!=': 'OP_NOTEQUAL',
+ '<=': 'OP_LESSEQUAL',
+ '>=': 'OP_GREATEREQUAL',
+ '<<': 'OP_LSHIFT',
+ '>>': 'OP_RSHIFT'
+}
+
+tokens = [
+ 'INT_LITERAL',
+ 'STRING_LITERAL',
+ 'OP',
+ 'IDENTIFIER'
+] + list(keywords.values()) + list(ops.values())
+
+def t_INT_LITERAL(t):
+ r'(0x[a-fA-F0-9]+|\d+)'
+ if t.value.startswith('0x'):
+ t.value = int(t.value[2:], 16)
+ else:
+ t.value = int(t.value)
+ return t
+
+def t_OP(t):
+ r'(&&|\|\||==|!=|<=|>=|<<|>>)'
+ t.type = ops.get(t.value)
+ return t
+
+def t_IDENTIFIER(t):
+ r'[a-zA-Z_][a-zA-Z_0-9]*'
+ t.type = keywords.get(t.value, 'IDENTIFIER')
+ return t
+
+def t_STRING_LITERAL(t):
+ r'"(\\.|[^"\\])*"'
+ t.value = t.value[1:-1]
+ return t
+
+literals = "+*/(){};:,=&|!~^.%?-<>[]"
+
+t_ignore = ' \t'
+
+def t_newline(t):
+ r'\n+'
+ t.lexer.lineno += len(t.value)
+
+def t_error(t):
+ print("WUT: {}".format(t.value))
+ t.lexer.skip(1)
+
+LEXER = lex.lex()
+
+# PARSER
+
+precedence = (
+ ('right', '?', ':'),
+ ('left', 'OP_LOGICAL_OR', 'OP_LOGICAL_AND'),
+ ('left', '|'),
+ ('left', '^'),
+ ('left', '&'),
+ ('left', 'OP_EQUALEQUAL', 'OP_NOTEQUAL'),
+ ('left', '<', '>', 'OP_LESSEQUAL', 'OP_GREATEREQUAL'),
+ ('left', 'OP_LSHIFT', 'OP_RSHIFT'),
+ ('left', '+', '-'),
+ ('left', '*', '/', '%'),
+ ('right', '!', '~'),
+ ('left', '[', ']', '.')
+)
+
+def p_module(p):
+ 'module : element_list'
+ p[0] = p[1]
+
+def p_element_list(p):
+ '''element_list : element_list element
+ | element'''
+ if len(p) == 2:
+ p[0] = [p[1]]
+ else:
+ p[0] = p[1] + [p[2]]
+
+def p_element(p):
+ '''element : kernel_definition
+ | kernel_module_definition
+ | library_definition
+ | metakernel_definition
+ | module_name
+ | struct_definition
+ | const_definition
+ | import_definition'''
+ p[0] = p[1]
+
+def p_module_name(p):
+ 'module_name : KW_MODULE IDENTIFIER ";"'
+ p[0] = ('module-name', p[2])
+
+def p_kernel_module_definition(p):
+ 'kernel_module_definition : KW_KERNEL_MODULE IDENTIFIER "(" STRING_LITERAL ")" "{" kernel_definition_list "}"'
+ p[0] = ('kernel-module', p[2], p[4], p[7])
+
+def p_kernel_definition(p):
+ 'kernel_definition : KW_KERNEL IDENTIFIER optional_annotation_list'
+ p[0] = ('kernel', p[2], p[3])
+
+def p_library_definition(p):
+ 'library_definition : KW_LIBRARY IDENTIFIER "{" library_definition_list "}"'
+ p[0] = ('library', p[2], p[4])
+
+def p_library_definition_list(p):
+ '''library_definition_list :
+ | library_definition_list IDENTIFIER STRING_LITERAL ";"'''
+ if len(p) < 3:
+ p[0] = []
+ else:
+ p[0] = p[1]
+ p[0].append((p[2], p[3]))
+
+def p_import_definition(p):
+ 'import_definition : KW_IMPORT KW_STRUCT IDENTIFIER STRING_LITERAL ";"'
+ p[0] = ('import', p[4], 'struct', p[3])
+
+def p_links_definition(p):
+ 'links_definition : KW_LINKS IDENTIFIER'
+
+ # Process a library include like a preprocessor
+ global libraries
+
+ if p[2] not in libraries:
+ raise Exception("Unable to find library {0}".format(p[2]))
+ p[0] = libraries[p[2]]
+
+def p_metakernel_definition(p):
+ 'metakernel_definition : KW_METAKERNEL IDENTIFIER "(" optional_parameter_list ")" optional_annotation_list scope'
+ p[0] = ('meta-kernel', p[2], p[4], p[6], p[7])
+
+def p_kernel_definition_list(p):
+ '''kernel_definition_list :
+ | kernel_definition_list kernel_definition ";"
+ | kernel_definition_list links_definition ";"'''
+ if len(p) < 3:
+ p[0] = []
+ else:
+ p[0] = p[1]
+ p[0].append(p[2])
+
+def p_optional_annotation_list(p):
+ '''optional_annotation_list :
+ | "<" ">"
+ | "<" annotation_list ">"'''
+ if len(p) < 4:
+ p[0] = {}
+ else:
+ p[0] = p[2]
+
+def p_optional_parameter_list(p):
+ '''optional_parameter_list :
+ | parameter_list'''
+ p[0] = p[1] if len(p) > 1 else []
+
+def p_annotation_list(p):
+ '''annotation_list : annotation'''
+ p[0] = p[1]
+
+def p_annotation_list_append(p):
+ '''annotation_list : annotation_list "," annotation'''
+ p[0] = {**p[1], **p[3]}
+
+def p_annotation(p):
+ '''annotation : IDENTIFIER "=" INT_LITERAL
+ | IDENTIFIER "=" IDENTIFIER
+ | IDENTIFIER "=" STRING_LITERAL'''
+ p[0] = {p[1]: p[3]}
+
+def p_parameter_list(p):
+ '''parameter_list : parameter_definition'''
+ p[0] = [p[1]]
+
+def p_parameter_list_append(p):
+ '''parameter_list : parameter_list "," parameter_definition'''
+ p[0] = p[1]
+ p[0].append(p[3])
+
+def p_parameter_definition(p):
+ 'parameter_definition : IDENTIFIER IDENTIFIER'
+ p[0] = (p[1], p[2])
+
+def p_scope(p):
+ '''scope : "{" optional_statement_list "}"'''
+ p[0] = p[2]
+
+def p_optional_statement_list(p):
+ '''optional_statement_list :
+ | statement_list'''
+ p[0] = p[1] if len(p) > 1 else []
+
+def p_statement_list(p):
+ '''statement_list : statement'''
+ p[0] = [p[1]]
+
+def p_statement_list_append(p):
+ '''statement_list : statement_list statement'''
+ p[0] = p[1]
+ p[0].append(p[2])
+
+def p_statement(p):
+ '''statement : definition_statement ";"
+ | assignment_statement ";"
+ | load_store_statement ";"
+ | dispatch_statement ";"
+ | semaphore_statement ";"
+ | label
+ | goto_statement ";"
+ | scope_statement
+ | atomic_op_statement ";"
+ | control_statement ";"
+ | print_statement ";"
+ | debug_break_statement ";"'''
+ p[0] = p[1]
+
+def p_definition_statement(p):
+ 'definition_statement : KW_DEFINE IDENTIFIER value'
+ p[0] = ('define', p[2], p[3])
+
+def p_assignemt_statement(p):
+ 'assignment_statement : value "=" value'
+ p[0] = ('assign', p[1], p[3])
+
+def p_load_store_statement_load_dword(p):
+ '''load_store_statement : value "=" KW_LOAD_DWORD "(" value ")"'''
+ p[0] = ('load-dword', p[1], p[5])
+
+def p_load_store_statement_load_qword(p):
+ '''load_store_statement : value "=" KW_LOAD_QWORD "(" value ")"'''
+ p[0] = ('load-qword', p[1], p[5])
+
+def p_load_store_statement_store_dword(p):
+ '''load_store_statement : KW_STORE_DWORD "(" value "," value ")"'''
+ p[0] = ('store-dword', p[3], p[5])
+
+def p_load_store_statement_store_qword(p):
+ '''load_store_statement : KW_STORE_QWORD "(" value "," value ")"'''
+ p[0] = ('store-qword', p[3], p[5])
+
+def p_dispatch_statement(p):
+ '''dispatch_statement : direct_dispatch_statement
+ | indirect_dispatch_statement'''
+ p[0] = p[1]
+
+def p_direct_dispatch_statement(p):
+ '''direct_dispatch_statement : KW_DISPATCH IDENTIFIER "(" value "," value "," value ")" optional_kernel_arg_list optional_postsync'''
+ p[0] = ('dispatch', p[2], (p[4], p[6], p[8]), p[10], p[11])
+
+def p_indirect_dispatch_statement(p):
+ '''indirect_dispatch_statement : KW_DISPATCH_INDIRECT IDENTIFIER optional_kernel_arg_list optional_postsync'''
+ p[0] = ('dispatch', p[2], None, p[3], p[4])
+
+def p_optional_kernel_arg_list(p):
+ '''optional_kernel_arg_list :
+ | KW_ARGS "(" value_list ")"'''
+ p[0] = p[3] if len(p) > 3 else []
+
+def p_value_list(p):
+ '''value_list : value'''
+ p[0] = [p[1]]
+
+def p_value_list_append(p):
+ '''value_list : value_list "," value'''
+ p[0] = p[1]
+ p[0].append(p[3])
+
+def p_optional_postsync(p):
+ '''optional_postsync :
+ | postsync_operation'''
+ if len(p) > 1:
+ p[0] = p[1]
+
+def p_postsync_operation(p):
+ '''postsync_operation : postsync_write_dword
+ | postsync_write_timestamp'''
+ p[0] = p[1]
+
+def p_postsync_write_dword(p):
+ '''postsync_write_dword : KW_POSTSYNC KW_STORE_DWORD "(" value "," value ")"'''
+ p[0] = ('postsync', 'store-dword', p[4], p[6])
+
+def p_postsync_write_timestamp(p):
+ '''postsync_write_timestamp : KW_POSTSYNC KW_STORE_TIMESTAMP "(" value ")"'''
+ p[0] = ('postsync', 'timestamp', p[4])
+
+def p_semaphore_statement(p):
+ '''semaphore_statement : KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value "<" value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value ">" value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_LESSEQUAL value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_GREATEREQUAL value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_EQUALEQUAL value ")"
+ | KW_SEMAPHORE_WAIT KW_WHILE "(" "*" value OP_NOTEQUAL value ")"'''
+ p[0] = ('sem-wait-while', p[5], p[6], p[7])
+
+def p_atomic_op_statement(p):
+ '''atomic_op_statement : KW_ATOMIC IDENTIFIER IDENTIFIER "(" value_list ")"'''
+ p[0] = ('atomic', p[2], p[3], p[5])
+
+def p_atomic_op_statement_return(p):
+ '''atomic_op_statement : KW_ATOMIC_RETURN IDENTIFIER IDENTIFIER "(" value_list ")"'''
+ p[0] = ('atomic-return', p[2], p[3], p[5])
+
+def p_label(p):
+ '''label : IDENTIFIER ":"'''
+ p[0] = ('label', p[1])
+
+def p_goto_statement(p):
+ '''goto_statement : KW_GOTO IDENTIFIER'''
+ p[0] = ('goto', p[2])
+
+def p_goto_statement_if(p):
+ '''goto_statement : KW_GOTO IDENTIFIER KW_IF "(" value ")"'''
+ p[0] = ('goto-if', p[2], p[5])
+
+def p_goto_statement_if_not(p):
+ '''goto_statement : KW_GOTO IDENTIFIER KW_IF KW_NOT "(" value ")"'''
+ p[0] = ('goto-if-not', p[2], p[6])
+
+def p_scope_statement(p):
+ '''scope_statement : scope'''
+ p[0] = (p[1])
+
+def p_control_statement(p):
+ '''control_statement : KW_CONTROL "(" id_list ")"'''
+ p[0] = ('control', p[3])
+
+def p_print_statement(p):
+ '''print_statement : KW_PRINT "(" printable_list ")"'''
+ p[0] = ('print', p[3])
+
+def p_printable_list(p):
+ '''printable_list : printable'''
+ p[0] = [p[1]]
+
+def p_printable_list_append(p):
+ '''printable_list : printable_list "," printable'''
+ p[0] = p[1]
+ p[0].append(p[3])
+
+def p_printable_str_lit(p):
+ '''printable : STRING_LITERAL'''
+ p[0] = '"{}"'.format(p[1])
+
+def p_printable_value(p):
+ '''printable : value'''
+ p[0] = p[1]
+
+def p_printable_str_lit_value(p):
+ '''printable : STRING_LITERAL value'''
+ p[0] = ('"{}"'.format(p[1]), p[2])
+
+def p_debug_break_statement(p):
+ '''debug_break_statement : KW_DEBUGBREAK'''
+ p[0] = ('debug-break',)
+
+def p_id_list(p):
+ '''id_list : IDENTIFIER'''
+ p[0] = [p[1]]
+
+def p_id_list_append(p):
+ '''id_list : id_list "," IDENTIFIER'''
+ p[0] = p[1]
+ p[0].append(p[3])
+
+def p_value(p):
+ '''value : IDENTIFIER
+ | INT_LITERAL'''
+ p[0] = p[1]
+
+def p_value_braces(p):
+ '''value : "(" value ")"'''
+ p[0] = (p[2])
+
+def p_value_member(p):
+ '''value : value "." IDENTIFIER'''
+ p[0] = ('member', p[1], p[3])
+
+def p_value_idx(p):
+ '''value : value "[" value "]"'''
+ p[0] = ('index', p[1], p[3])
+
+def p_value_binop(p):
+ '''value : value "+" value
+ | value "-" value
+ | value "*" value
+ | value "/" value
+ | value "%" value
+ | value "&" value
+ | value "|" value
+ | value "<" value
+ | value ">" value
+ | value "^" value
+ | value OP_LESSEQUAL value
+ | value OP_GREATEREQUAL value
+ | value OP_EQUALEQUAL value
+ | value OP_NOTEQUAL value
+ | value OP_LOGICAL_AND value
+ | value OP_LOGICAL_OR value
+ | value OP_LSHIFT value
+ | value OP_RSHIFT value'''
+ p[0] = (p[2], p[1], p[3])
+
+def p_value_uniop(p):
+ '''value : "!" value
+ | "~" value'''
+ p[0] = (p[1], p[2])
+
+def p_value_cond(p):
+ '''value : value "?" value ":" value'''
+ p[0] = ('?', p[1], p[3], p[5])
+
+def p_value_funcop(p):
+ '''value : KW_OFFSETOF "(" offset_expression ")"
+ | KW_SHIFTOF "(" IDENTIFIER ")"
+ | KW_SIZEOF "(" IDENTIFIER ")"'''
+ p[0] = (p[1], p[3])
+
+def p_offset_expression(p):
+ '''offset_expression : IDENTIFIER'''
+ p[0] = p[1]
+
+def p_offset_expression_member(p):
+ '''offset_expression : offset_expression "." IDENTIFIER'''
+ p[0] = ('member', p[1], p[3])
+
+def p_offset_expression_idx(p):
+ '''offset_expression : offset_expression "[" INT_LITERAL "]"'''
+ p[0] = ('index', p[1], p[3])
+
+def p_struct_definition(p):
+ '''struct_definition : KW_STRUCT optional_alignment_specifier IDENTIFIER "{" optional_struct_member_list "}" ";"'''
+ p[0] = ('struct', p[3], p[5], p[2])
+
+def p_optional_alignment_specifier(p):
+ '''optional_alignment_specifier :
+ | KW_ALIGNAS "(" INT_LITERAL ")"'''
+ if len(p) == 1:
+ p[0] = 0
+ else:
+ p[0] = p[3]
+
+def p_optional_struct_member_list(p):
+ '''optional_struct_member_list :
+ | struct_member_list'''
+ if len(p) == 1:
+ p[0] = {}
+ else:
+ p[0] = p[1]
+
+def p_struct_member_list(p):
+ '''struct_member_list : struct_member'''
+ p[0] = [p[1]]
+
+def p_struct_member_list_append(p):
+ '''struct_member_list : struct_member_list struct_member'''
+ p[0] = p[1] + [p[2]]
+
+def p_struct_member(p):
+ '''struct_member : struct_member_typename IDENTIFIER ";"'''
+ p[0] = (p[1], p[2])
+
+def p_struct_member_array(p):
+ '''struct_member : struct_member_typename IDENTIFIER "[" INT_LITERAL "]" ";"'''
+ '''struct_member : struct_member_typename IDENTIFIER "[" IDENTIFIER "]" ";"'''
+ p[0] = {p[1]: p[2], 'count': p[4]}
+
+def p_struct_member_typename(p):
+ '''struct_member_typename : IDENTIFIER'''
+ p[0] = p[1]
+
+def p_struct_member_typename_unsigned(p):
+ '''struct_member_typename : KW_UNSIGNED IDENTIFIER'''
+ p[0] = ('unsigned', p[2])
+
+def p_struct_member_typename_struct(p):
+ '''struct_member_typename : KW_STRUCT IDENTIFIER'''
+ p[0] = ('struct', p[2])
+
+def p_const_definition(p):
+ '''const_definition : KW_CONST IDENTIFIER "=" INT_LITERAL ";"'''
+ p[0] = ('named-constant', p[2], p[4])
+
+PARSER = yacc.yacc()
+
+# Shamelessly stolen from some StackOverflow answer
+def _remove_comments(text):
+ def replacer(match):
+ s = match.group(0)
+ if s.startswith('/'):
+ return " " # note: a space and not an empty string
+ else:
+ return s
+ pattern = re.compile(
+ r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+ re.DOTALL | re.MULTILINE
+ )
+ return re.sub(pattern, replacer, text)
+
+def parse_grl_file(grl_fname, libs):
+ global libraries
+
+ libraries = libs
+ with open(grl_fname, 'r') as f:
+ return PARSER.parse(_remove_comments(f.read()))
diff --git a/src/intel/vulkan/grl/grl_structs.h b/src/intel/vulkan/grl/grl_structs.h
new file mode 100644
index 00000000000..ed721afa6a2
--- /dev/null
+++ b/src/intel/vulkan/grl/grl_structs.h
@@ -0,0 +1,479 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * This file contains a redefinition of structures defined in the GRL library.
+ * We need to have those structures defined to allocate & prepare data for
+ * the OpenCL kernels building acceleration structures. Unfortunately, because
+ * of C++ & OpenCL assumptions in GRL, it's not possible to just include the
+ * GRL header files directly, so we have to redefine them here.
+ */
+
+#ifndef GRL_STRUCTS_H
+#define GRL_STRUCTS_H
+
+#include "GRLStructs.h"
+#include "GRLRTASCommon.h"
+
+struct MKBuilderState {
+ qword geomDesc_buffer;
+ qword build_primref_buffer;
+ qword build_globals;
+ qword bvh_buffer;
+ dword leaf_type;
+ dword leaf_size;
+};
+
+#define PREFIX_MK_STATE(prefix, obj) \
+ (struct prefix##_MKBuilderState) { \
+ .geomDesc_buffer = (obj).geomDesc_buffer, \
+ .build_primref_buffer = (obj).build_primref_buffer, \
+ .build_globals = (obj).build_globals, \
+ .bvh_buffer = (obj).bvh_buffer, \
+ .leaf_type = (obj).leaf_type, \
+ .leaf_size = (obj).leaf_size, \
+ }
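
PREFIX_MK_STATE builds a compound literal of the per-prefix mirror of MKBuilderState by copying each field. A minimal usage sketch, assuming a generated struct gfx125_MKBuilderState with the same members; the gfx125 prefix and the placeholder addresses are only for illustration, not taken from this patch:

    struct MKBuilderState state = {
       .geomDesc_buffer      = 0x1000,   /* placeholder GPU VAs */
       .build_primref_buffer = 0x2000,
       .build_globals        = 0x3000,
       .bvh_buffer           = 0x4000,
       .leaf_type            = 0,
       .leaf_size            = 64,
    };
    struct gfx125_MKBuilderState mirrored = PREFIX_MK_STATE(gfx125, state);
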
+
+struct MKSizeEstimate {
+ dword numTriangles;
+ dword numProcedurals;
+ dword numPrimitives;
+ dword numMeshes;
+ dword numBuildPrimitives;
+ dword numPrimitivesToSplit;
+ dword instance_descs_start;
+ dword geo_meta_data_start;
+ dword node_data_start;
+ dword leaf_data_start;
+ dword procedural_data_start;
+ dword back_pointer_start;
+ dword sizeTotal;
+ dword updateScratchSizeTotal;
+ dword fatleaf_table_start;
+ dword innernode_table_start;
+ dword max_fatleaves;
+
+ size_t max_instance_leafs;
+ size_t max_inner_nodes;
+ size_t leaf_data_size;
+ size_t min_primitives;
+ size_t max_primitives;
+};
+
+#define PREFIX_MK_SIZE(prefix, obj) \
+ (struct prefix##_MKSizeEstimate) { \
+ .numTriangles = (obj).numTriangles, \
+ .numProcedurals = (obj).numProcedurals, \
+ .numPrimitives = (obj).numPrimitives, \
+ .numMeshes = (obj).numMeshes, \
+ .numBuildPrimitives = (obj).numBuildPrimitives, \
+ .numPrimitivesToSplit = (obj).numPrimitivesToSplit, \
+ .instance_descs_start = (obj).instance_descs_start, \
+ .geo_meta_data_start = (obj).geo_meta_data_start, \
+ .node_data_start = (obj).node_data_start, \
+ .leaf_data_start = (obj).leaf_data_start, \
+ .procedural_data_start = (obj).procedural_data_start, \
+ .back_pointer_start = (obj).back_pointer_start, \
+ .sizeTotal = (obj).sizeTotal, \
+ .updateScratchSizeTotal = (obj).updateScratchSizeTotal, \
+ .fatleaf_table_start = (obj).fatleaf_table_start, \
+ .innernode_table_start = (obj).innernode_table_start, \
+ .max_fatleaves = (obj).max_fatleaves, \
+ }
+
+typedef struct AABB {
+ float lower[4];
+ float upper[4];
+} AABB;
+
+struct Globals
+{
+ struct AABB centroidBounds;
+
+ unsigned int build_record_start;
+ unsigned int numPrimitives;
+ unsigned int leafPrimType;
+ unsigned int leafSize;
+
+ unsigned int numSplittedPrimitives;
+ unsigned int numBuildRecords;
+
+ // spatial split state
+ unsigned int numOriginalPrimitives;
+ float presplitPrioritySum;
+ float probThreshold;
+
+ // binned-sah bfs state
+ unsigned int counter;
+ unsigned int numBuildRecords_extended;
+
+ // sync variable used for global-sync on work groups
+ unsigned int sync;
+
+
+ /* morton code builder state */
+ unsigned int shift; // used by adaptive mc-builder
+ unsigned int shift_mask; // used by adaptive mc-builder
+ unsigned int binary_hierarchy_root;
+ unsigned int p0_allocated_num;
+ unsigned int p0_created_num;
+ unsigned int morton_sort_in_flight;
+ unsigned int sort_iterations;
+
+ gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
+};
+
+typedef struct BVHBase
+{
+ // TODO: Implement the "copy-first-node" trick... duplicate root node here
+
+ uint64_t rootNodeOffset;
+
+ uint32_t reserved;
+
+ uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
+ uint32_t quadLeafStart;
+ uint32_t quadLeafCur;
+ uint32_t proceduralDataStart;
+ uint32_t proceduralDataCur;
+ uint32_t instanceLeafStart;
+ uint32_t instanceLeafEnd;
+ uint32_t backPointerDataStart; //
+ uint32_t refitTreeletsDataStart; // refit structs
+ uint32_t refitStartPointDataStart; //
+ uint32_t BVHDataEnd;
+
+ // number of bottom treelets
+ // if 1, then the bottom treelet is also tip treelet
+ uint32_t refitTreeletCnt;
+ uint32_t refitTreeletCnt2; // always 0, used for atomic updates
+ // data layout:
+ // @backPointerDataStart
+ // 'backpointer' - a dword per inner node.
+ // The bits are used as follows:
+ // 2:0 --> Used as a refit counter during BVH refitting. MBZ
+ // 5:3 --> Number of children
+ // 31:6 --> Index of the parent node in the internal node array
+ // The root node has a parent index of all ones
+ // @refitTreeletsDataStart
+ // RefitTreelet[], the last treelet is for top treelet all previous are for bottom
+ // @refitStartPointDataStart
+ // for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
+ // @backPointerDataEnd
+
+ uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
+ uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
+ uint32_t fatLeafTableStart;
+ uint32_t innerTableStart;
+
+ uint32_t _pad[12];
+
+ struct RTASMetaData Meta;
+} BVHBase;
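
A minimal decoding sketch for the backpointer dwords described in the layout comment above (bits 2:0 refit counter, bits 5:3 child count, bits 31:6 parent index); the helper names are made up for illustration and are not part of the driver:

    static inline uint32_t bvh_backpointer_refit_count(uint32_t bp)  { return bp & 0x7; }
    static inline uint32_t bvh_backpointer_num_children(uint32_t bp) { return (bp >> 3) & 0x7; }
    static inline uint32_t bvh_backpointer_parent(uint32_t bp)       { return bp >> 6; }
    /* The root node stores a parent index of all ones. */
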
+
+
+struct BatchedInitGlobalsData
+{
+ qword p_build_globals;
+ qword p_bvh_buffer;
+ dword numPrimitives;
+ dword numGeometries;
+ dword numInstances;
+ dword instance_descs_start;
+ dword geo_meta_data_start;
+ dword node_data_start;
+ dword leaf_data_start;
+ dword procedural_data_start;
+ dword back_pointer_start;
+ dword sizeTotal;
+ dword leafType;
+ dword leafSize;
+ dword fatleaf_table_start;
+ dword innernode_table_start;
+};
+
+
+#define BFS_NUM_BINS 16
+#define BFS_NUM_VCONTEXTS 256
+#define BFS_MAX_DEPTH 32
+
+#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
+
+struct BFS_Split
+{
+ float sah;
+ int dim;
+ int pos;
+};
+
+struct BFS_BinInfo
+{
+ float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
+ // The 6 are lower(xyz) and -upper(xyz)
+ // bins use negated-max so that we can use vectorized mins instead of min/max pairs
+ uint counts[3 * BFS_NUM_BINS];
+};
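
As the layout comment notes, each bin stores lower(xyz) and negated upper(xyz) so that extending a bin only needs min operations. A sketch of that update for one primitive, assuming counts[] is indexed [axis][bin] like min_max[]; the function name is made up:

    static void bfs_bin_extend(struct BFS_BinInfo *info, uint axis, uint bin,
                               const struct AABB3f *box)
    {
       float *slot = &info->min_max[(axis * BFS_NUM_BINS + bin) * 6];
       for (uint i = 0; i < 3; i++) {
          slot[i]     = fmin(slot[i], box->lower[i]);      /* running min of lower */
          slot[i + 3] = fmin(slot[i + 3], -box->upper[i]); /* negated max: min of -upper */
       }
       info->counts[axis * BFS_NUM_BINS + bin]++;
    }
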
+
+struct SAHBuildGlobals
+{
+ qword p_primref_index_buffers;
+ qword p_primrefs_buffer;
+ qword p_bvh2;
+ qword p_globals; // TODO: deprecate this
+ qword p_bvh_base;
+ gpuva_t p_qnode_root_buffer;
+
+ dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
+ dword num_primrefs;
+ dword leaf_size;
+ dword leaf_type;
+
+ dword root_buffer_num_produced;
+ dword root_buffer_num_produced_hi;
+ dword root_buffer_num_consumed;
+ dword root_buffer_num_consumed_hi;
+ dword root_buffer_num_to_consume;
+ dword root_buffer_num_to_consume_hi;
+};
+
+typedef union LRBounds
+{
+ struct
+ {
+ struct AABB3f left_centroid_bounds;
+ struct AABB3f left_geom_bounds;
+ struct AABB3f right_centroid_bounds;
+ struct AABB3f right_geom_bounds;
+ } boxes;
+ struct
+ {
+ float Array[24];
+ } scalars;
+} LRBounds;
+
+
+struct VContext
+{
+ uint dispatch_primref_begin; // range of primrefs for this task
+ uint dispatch_primref_end;
+ uint bvh2_root; // BVH2 root node for this task
+ uint tree_depth; // depth of this node in the tree
+ uint num_left; // primref counts
+ uint num_right;
+ uint lr_mask; // lower 8b : left mask. upper 8b : right mask
+ uint batch_index;
+
+ // pass1 global working state and output
+ struct BFS_Split split;
+ struct BFS_BinInfo global_bin_info;
+
+ // pass2 global working state and output
+ LRBounds lr_bounds;
+};
+
+
+
+struct BFSDispatchRecord
+{
+ ushort batch_index;
+ ushort context_id;
+};
+
+
+struct BFSDispatchQueue
+{
+ uint num_dispatches;
+ uint wg_count[BFS_NUM_VCONTEXTS];
+ struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
+};
+
+struct BFS1SpillStackEntry
+{
+ uint primref_begin;
+ uint primref_end;
+ uint bvh2_root;
+ ushort tree_depth;
+ ushort batch_index;
+};
+
+struct BFS1SpillStack
+{
+ uint size;
+ struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
+};
+
+struct QNodeGlobalRootBufferEntry
+{
+ uint bvh2_node;
+ uint qnode;
+ uint build_idx;
+ uint _pad;
+};
+
+struct QNodeGlobalRootBuffer
+{
+ uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
+ struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
+};
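
The comment above implies entries is used as two ping-pong halves selected by curr_entries_offset (either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM). A sketch of switching to the other half; the helper name is made up and not part of the driver:

    static inline struct QNodeGlobalRootBufferEntry *
    qnode_grb_flip(struct QNodeGlobalRootBuffer *grb)
    {
       grb->curr_entries_offset ^= QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM; /* toggle 0 <-> N */
       return &grb->entries[grb->curr_entries_offset];
    }
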
+
+struct DFSDispatchRecord
+{
+ uint primref_base;
+ uint bvh2_base;
+ uint batch_index;
+ ushort num_primrefs;
+ ushort tree_depth;
+};
+
+
+struct DFSDispatchQueue
+{
+ struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
+};
+
+#define VCONTEXT_STATE_EXECUTING 0
+#define VCONTEXT_STATE_UNALLOCATED 1
+
+union SchedulerUnion
+{
+ struct VContextScheduler
+ {
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'new_sah_builder.grl'
+ /////////////////////////////////////////////////////////////
+
+ dword num_bfs_wgs;
+ dword num_dfs_wgs;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
+
+ dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
+ dword batched_build_loop_mask; // 0 if #builds <= #contexts, else 1. The command streamer uses this as a loop condition
+
+ /////////////////////////////////////////////////////////////
+
+ dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
+ dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+ dword vcontext_state[BFS_NUM_VCONTEXTS];
+
+ struct BFSDispatchQueue bfs_queue;
+ struct DFSDispatchQueue dfs_queue;
+
+ struct VContext contexts[BFS_NUM_VCONTEXTS];
+
+ struct BFS1SpillStack bfs2_spill_stack;
+ } vContextScheduler;
+
+ struct QnodeScheduler
+ {
+ dword num_qnode_grb_curr_entries;
+ dword num_qnode_grb_new_entries;
+
+ dword scheduler_postsync;
+ dword _pad1;
+
+ dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
+ dword num_single_builds; // number of single-wg builds (#primrefs < threshold)
+
+ dword batched_builds_to_process;
+ dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
+
+ /////////////////////////////////////////////////////////////
+
+ dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
+ dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
+
+ struct QNodeGlobalRootBuffer qnode_global_root_buffer;
+ } qnodeScheduler;
+};
+
+
+struct BVH2Node
+{
+ struct AABB3f box;
+ uint meta_u; // leaf: primref start. inner: offset from node to its first child
+ uint meta_ss;
+ //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
+ //uchar is_inner; // 1 if inner, 0 if leaf
+ //uchar mask;
+};
+
+struct BVH2
+{
+ uint num_nodes;
+ uint _pad[7]; // align to 32B
+};
+
+struct BatchedBLSDispatchEntry
+{
+ /////////////////////////////////////////////////////////////
+ // State data used for communication with command streamer
+ // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
+ /////////////////////////////////////////////////////////////
+ qword p_data_buffer;
+ qword num_elements; // number of elements in p_data_buffer
+};
+
+struct SAHBuildArgsBatchable
+{
+ qword p_globals_ptrs;
+ qword p_scheduler;
+ qword p_buffers_info;
+ qword p_sah_globals;
+
+ dword num_max_qnode_global_root_buffer_entries;
+ dword num_builds;
+};
+
+#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj) \
+ (struct prefix##_SAHBuildArgsBatchable) { \
+ .p_globals_ptrs = (obj).p_globals_ptrs, \
+ .p_scheduler = (obj).p_scheduler, \
+ .p_buffers_info = (obj).p_buffers_info, \
+ .p_sah_globals = (obj).p_sah_globals, \
+ .num_max_qnode_global_root_buffer_entries = \
+ (obj).num_max_qnode_global_root_buffer_entries, \
+ .num_builds = (obj).num_builds, \
+ }
+
+
+struct SAHBuildBuffersInfo
+{
+ gpuva_t p_globals;
+ gpuva_t p_primref_index_buffers;
+ gpuva_t p_primrefs_buffer;
+ gpuva_t p_bvh2;
+ gpuva_t p_bvh_base;
+ gpuva_t p_qnode_root_buffer;
+ dword sah_globals_flags;
+ dword _pad;
+ gpuva_t _pad2;
+};
+
+#endif /* GRL_STRUCTS_H */
diff --git a/src/intel/vulkan/grl/include/AABB3f.h b/src/intel/vulkan/grl/include/AABB3f.h
new file mode 100644
index 00000000000..a3412332c77
--- /dev/null
+++ b/src/intel/vulkan/grl/include/AABB3f.h
@@ -0,0 +1,459 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLRTASCommon.h"
+
+#include "affinespace.h"
+
+#ifndef __OPENCL_VERSION__
+# include "stdio.h" //for printf
+#endif
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+
+GRL_INLINE void AABB3f_init(struct AABB3f *aabb)
+{
+ aabb->lower[0] = (float)(INFINITY);
+ aabb->lower[1] = (float)(INFINITY);
+ aabb->lower[2] = (float)(INFINITY);
+
+ aabb->upper[0] = -(float)(INFINITY);
+ aabb->upper[1] = -(float)(INFINITY);
+ aabb->upper[2] = -(float)(INFINITY);
+}
+
+GRL_INLINE float3 AABB3f_load_lower( const struct AABB3f* aabb )
+{
+ float3 v = { aabb->lower[0], aabb->lower[1], aabb->lower[2] };
+ return v;
+}
+GRL_INLINE float3 AABB3f_load_upper( const struct AABB3f* aabb )
+{
+ float3 v = { aabb->upper[0], aabb->upper[1], aabb->upper[2] };
+ return v;
+}
+
+GRL_INLINE void AABB3f_extend(struct AABB3f *aabb, const struct AABB3f *v)
+{
+ aabb->lower[0] = fmin(aabb->lower[0], v->lower[0]);
+ aabb->lower[1] = fmin(aabb->lower[1], v->lower[1]);
+ aabb->lower[2] = fmin(aabb->lower[2], v->lower[2]);
+ aabb->upper[0] = fmax(aabb->upper[0], v->upper[0]);
+ aabb->upper[1] = fmax(aabb->upper[1], v->upper[1]);
+ aabb->upper[2] = fmax(aabb->upper[2], v->upper[2]);
+}
+
+GRL_INLINE void AABB3f_intersect(struct AABB3f* aabb, struct AABB3f inters)
+{
+ aabb->upper[0] = fmin(inters.upper[0],aabb->upper[0]);
+ aabb->upper[1] = fmin(inters.upper[1],aabb->upper[1]);
+ aabb->upper[2] = fmin(inters.upper[2],aabb->upper[2]);
+ aabb->lower[0] = fmax(inters.lower[0],aabb->lower[0]);
+ aabb->lower[1] = fmax(inters.lower[1],aabb->lower[1]);
+ aabb->lower[2] = fmax(inters.lower[2],aabb->lower[2]);
+}
+
+GRL_INLINE void AABB3f_trim_upper(struct AABB3f* aabb, const float* upper)
+{
+ aabb->upper[0] = fmin(upper[0], aabb->upper[0]);
+ aabb->upper[1] = fmin(upper[1], aabb->upper[1]);
+ aabb->upper[2] = fmin(upper[2], aabb->upper[2]);
+}
+
+GRL_INLINE void AABB3f_set( struct AABB3f* aabb, float3 lower, float3 upper )
+{
+ aabb->lower[0] = lower.x ;
+ aabb->lower[1] = lower.y ;
+ aabb->lower[2] = lower.z ;
+ aabb->upper[0] = upper.x ;
+ aabb->upper[1] = upper.y ;
+ aabb->upper[2] = upper.z ;
+}
+
+inline void AABB3f_extend_point(struct AABB3f *aabb, const float3 p)
+{
+ aabb->lower[0] = fmin(aabb->lower[0], p.x);
+ aabb->lower[1] = fmin(aabb->lower[1], p.y);
+ aabb->lower[2] = fmin(aabb->lower[2], p.z);
+ aabb->upper[0] = fmax(aabb->upper[0], p.x);
+ aabb->upper[1] = fmax(aabb->upper[1], p.y);
+ aabb->upper[2] = fmax(aabb->upper[2], p.z);
+}
+
+GRL_INLINE void AABB3f_extendlu(struct AABB3f *aabb, const float3 lower, const float3 upper)
+{
+ aabb->lower[0] = fmin(aabb->lower[0], lower.x);
+ aabb->lower[1] = fmin(aabb->lower[1], lower.y);
+ aabb->lower[2] = fmin(aabb->lower[2], lower.z);
+ aabb->upper[0] = fmax(aabb->upper[0], upper.x);
+ aabb->upper[1] = fmax(aabb->upper[1], upper.y);
+ aabb->upper[2] = fmax(aabb->upper[2], upper.z);
+}
+
+GRL_INLINE float3 AABB3f_size(struct AABB3f* aabb)
+{
+ return AABB3f_load_upper(aabb) - AABB3f_load_lower(aabb);
+}
+
+GRL_INLINE float AABB3f_halfArea(struct AABB3f *aabb)
+{
+ const float3 d = AABB3f_load_upper( aabb ) - AABB3f_load_lower( aabb );
+ return d.x * (d.y + d.z) + d.y * d.z;
+}
+
+GRL_INLINE float halfArea_AABB3f(struct AABB3f *aabb) // TODO: Remove me
+{
+ const float3 d = { aabb->upper[0] - aabb->lower[0], aabb->upper[1] - aabb->lower[1], aabb->upper[2] - aabb->lower[2] };
+ return fma(d.x, (d.y + d.z), d.y * d.z);
+}
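+
+// Editor's note: in SAH-style builders the half-area is the surface-area term of the cost
+// estimate, e.g. cost(split) ~ halfArea(left) * numPrims(left) + halfArea(right) * numPrims(right),
+// so only relative values matter and the omitted factor of 2 is irrelevant.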
+
+GRL_INLINE void AABB3f_set_lower(struct AABB3f* aabb, float3 lower)
+{
+ aabb->lower[0] = lower.x;
+ aabb->lower[1] = lower.y;
+ aabb->lower[2] = lower.z;
+}
+
+GRL_INLINE void AABB3f_set_upper(struct AABB3f* aabb, float3 upper)
+{
+ aabb->upper[0] = upper.x;
+ aabb->upper[1] = upper.y;
+ aabb->upper[2] = upper.z;
+}
+
+GRL_INLINE float3 conservativeExtent(float3 extent)
+{
+ const float v = FLT_EPSILON * fmax(extent.x, fmax(extent.y, extent.z));
+ float3 v3 = { v,v,v };
+ extent = extent + v3;
+ return extent;
+}
+
+inline struct AABB3f GRL_OVERLOADABLE transform_aabb(float3 lower, float3 upper, const float* Transform)
+{
+#if 1
+ // We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area
+ // New AABB is center +- Extent.
+ //
+ // For derivation see:
+ // https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/
+ //
+
+ float3 Center = (upper + lower) * 0.5f;
+ float3 Extent = (conservativeExtent(upper) - lower) * 0.5f;
+
+ float cx = Center.x * Transform[0] + Center.y * Transform[1] + Center.z * Transform[2] + Transform[3];
+ float cy = Center.x * Transform[4] + Center.y * Transform[5] + Center.z * Transform[6] + Transform[7];
+ float cz = Center.x * Transform[8] + Center.y * Transform[9] + Center.z * Transform[10] + Transform[11];
+ float ex = Extent.x * fabs(Transform[0]) + Extent.y * fabs(Transform[1]) + Extent.z * fabs(Transform[2]);
+ float ey = Extent.x * fabs(Transform[4]) + Extent.y * fabs(Transform[5]) + Extent.z * fabs(Transform[6]);
+ float ez = Extent.x * fabs(Transform[8]) + Extent.y * fabs(Transform[9]) + Extent.z * fabs(Transform[10]);
+
+ Center.x = cx; Center.y = cy; Center.z = cz;
+ Extent.x = ex; Extent.y = ey; Extent.z = ez;
+
+ struct AABB3f box;
+ AABB3f_set_lower(&box, Center - Extent);
+ AABB3f_set_upper(&box, Center + Extent);
+ return box;
+#else
+ struct AffineSpace3f xfm = AffineSpace3f_load_row_major(Transform);
+
+ float3 plll = { lower.x, lower.y, lower.z };
+ float3 pllu = { lower.x, lower.y, upper.z };
+ float3 plul = { lower.x, upper.y, lower.z };
+ float3 pluu = { lower.x, upper.y, upper.z };
+ float3 pull = { upper.x, lower.y, lower.z };
+ float3 pulu = { upper.x, lower.y, upper.z };
+ float3 puul = { upper.x, upper.y, lower.z };
+ float3 puuu = { upper.x, upper.y, upper.z };
+ plll = xfmPoint(xfm, plll) ;
+ pllu = xfmPoint(xfm, pllu) ;
+ plul = xfmPoint(xfm, plul) ;
+ pluu = xfmPoint(xfm, pluu) ;
+ pull = xfmPoint(xfm, pull) ;
+ pulu = xfmPoint(xfm, pulu) ;
+ puul = xfmPoint(xfm, puul) ;
+ puuu = xfmPoint(xfm, puuu) ;
+
+ float3 p1_min = fmin(plll, pull);
+ float3 p2_min = fmin(pllu, pulu);
+ float3 p3_min = fmin(plul, puul);
+ float3 p4_min = fmin(pluu, puuu);
+ float3 p1_max = fmax(plll, pull);
+ float3 p2_max = fmax(pllu, pulu);
+ float3 p3_max = fmax(plul, puul);
+ float3 p4_max = fmax(pluu, puuu);
+ p1_min = fmin(p1_min, p3_min);
+ p2_min = fmin(p2_min, p4_min);
+ p1_max = fmax(p1_max, p3_max);
+ p2_max = fmax(p2_max, p4_max);
+ p1_min = fmin(p1_min, p2_min);
+ p1_max = fmax(p1_max, p2_max);
+
+ AABB3f out = {
+ {p1_min.x,p1_min.y,p1_min.z},
+ {p1_max.x,p1_max.y,p1_max.z}
+ };
+ return out;
+#endif
+}
+
+GRL_INLINE struct AABB3f GRL_OVERLOADABLE transform_aabb(struct AABB3f box, const float* Transform)
+{
+ float3 lower = { box.lower[0], box.lower[1], box.lower[2] };
+ float3 upper = { box.upper[0], box.upper[1], box.upper[2] };
+ return transform_aabb(lower, upper, Transform);
+}
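+
+// Editor's note: 'Transform' is a row-major 3x4 matrix laid out as
+// { m00, m01, m02, tx,   m10, m11, m12, ty,   m20, m21, m22, tz },
+// i.e. the same layout consumed by AffineSpace3f_load_row_major(). For an identity transform
+// the box is returned unchanged, up to the conservativeExtent() epsilon padding.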
+
+GRL_INLINE struct AABB3f AABB3f_transform(struct AffineSpace3f xfm, struct AABB3f in)
+{
+ struct AABB3f out;
+ float rmTransform[12];
+ load_row_major_from_AffineSpace3f(xfm, rmTransform);
+ out = transform_aabb(in, rmTransform);
+
+ return out;
+}
+
+GRL_INLINE bool AABB3f_isIn(struct AABB3f bigger, float3 contained)
+{
+ bool iscontained =
+ contained.x >= bigger.lower[0] &&
+ contained.y >= bigger.lower[1] &&
+ contained.z >= bigger.lower[2] &&
+ contained.x <= bigger.upper[0] &&
+ contained.y <= bigger.upper[1] &&
+ contained.z <= bigger.upper[2];
+
+ return iscontained;
+}
+
+GRL_INLINE bool AABB3f_isSubset(struct AABB3f bigger, struct AABB3f contained)
+{
+ bool iscontained =
+ contained.lower[0] >= bigger.lower[0] &&
+ contained.lower[1] >= bigger.lower[1] &&
+ contained.lower[2] >= bigger.lower[2] &&
+ contained.upper[0] <= bigger.upper[0] &&
+ contained.upper[1] <= bigger.upper[1] &&
+ contained.upper[2] <= bigger.upper[2];
+
+ return iscontained;
+}
+
+GRL_INLINE bool AABB3f_is_degenerate(struct AABB3f* box )
+{
+ return box->lower[0] > box->upper[0] ||
+ box->lower[1] > box->upper[1] ||
+ box->lower[2] > box->upper[2];
+}
+
+GRL_INLINE void AABB3f_print(struct AABB3f *aabb)
+{
+ printf("AABB {\n");
+ printf(" lower = %f, %f, %f\n", aabb->lower[0], aabb->lower[1], aabb->lower[2]);
+ printf(" upper = %f, %f, %f\n", aabb->upper[0], aabb->upper[1], aabb->upper[2]);
+ printf("}\n");
+}
+
+
+
+#ifdef __OPENCL_VERSION__
+GRL_INLINE struct AABB3f AABB3f_sub_group_shuffle(struct AABB3f *aabb, const uint slotID)
+{
+ struct AABB3f bounds;
+ bounds.lower[0] = intel_sub_group_shuffle(aabb->lower[0], slotID);
+ bounds.lower[1] = intel_sub_group_shuffle(aabb->lower[1], slotID);
+ bounds.lower[2] = intel_sub_group_shuffle(aabb->lower[2], slotID);
+ bounds.upper[0] = intel_sub_group_shuffle(aabb->upper[0], slotID);
+ bounds.upper[1] = intel_sub_group_shuffle(aabb->upper[1], slotID);
+ bounds.upper[2] = intel_sub_group_shuffle(aabb->upper[2], slotID);
+ return bounds;
+}
+
+GRL_INLINE struct AABB3f AABB3f_sub_group_reduce(struct AABB3f *aabb)
+{
+ struct AABB3f bounds;
+ bounds.lower[0] = sub_group_reduce_min(aabb->lower[0]);
+ bounds.lower[1] = sub_group_reduce_min(aabb->lower[1]);
+ bounds.lower[2] = sub_group_reduce_min(aabb->lower[2]);
+ bounds.upper[0] = sub_group_reduce_max(aabb->upper[0]);
+ bounds.upper[1] = sub_group_reduce_max(aabb->upper[1]);
+ bounds.upper[2] = sub_group_reduce_max(aabb->upper[2]);
+ return bounds;
+}
+
+GRL_INLINE struct AABB3f AABB3f_sub_group_scan_exclusive_min_max(struct AABB3f *aabb)
+{
+ struct AABB3f bounds;
+ bounds.lower[0] = sub_group_scan_exclusive_min(aabb->lower[0]);
+ bounds.lower[1] = sub_group_scan_exclusive_min(aabb->lower[1]);
+ bounds.lower[2] = sub_group_scan_exclusive_min(aabb->lower[2]);
+ bounds.upper[0] = sub_group_scan_exclusive_max(aabb->upper[0]);
+ bounds.upper[1] = sub_group_scan_exclusive_max(aabb->upper[1]);
+ bounds.upper[2] = sub_group_scan_exclusive_max(aabb->upper[2]);
+ return bounds;
+}
+
+GRL_INLINE struct AABB3f AABB3f_sub_group_scan_inclusive_min_max(struct AABB3f *aabb)
+{
+ struct AABB3f bounds;
+ bounds.lower[0] = sub_group_scan_inclusive_min(aabb->lower[0]);
+ bounds.lower[1] = sub_group_scan_inclusive_min(aabb->lower[1]);
+ bounds.lower[2] = sub_group_scan_inclusive_min(aabb->lower[2]);
+ bounds.upper[0] = sub_group_scan_inclusive_max(aabb->upper[0]);
+ bounds.upper[1] = sub_group_scan_inclusive_max(aabb->upper[1]);
+ bounds.upper[2] = sub_group_scan_inclusive_max(aabb->upper[2]);
+ return bounds;
+}
+
+GRL_INLINE void AABB3f_atomic_merge_local_nocheck(local struct AABB3f *aabb, const float4 lower, const float4 upper)
+{
+ atomic_min((local float *)&aabb->lower + 0, lower.x);
+ atomic_min((local float *)&aabb->lower + 1, lower.y);
+ atomic_min((local float *)&aabb->lower + 2, lower.z);
+ atomic_max((local float *)&aabb->upper + 0, upper.x);
+ atomic_max((local float *)&aabb->upper + 1, upper.y);
+ atomic_max((local float *)&aabb->upper + 2, upper.z);
+}
+
+
+GRL_INLINE void AABB3f_atomic_merge_global_lu( global struct AABB3f* aabb, const float3 lower, const float3 upper )
+{
+ atomic_min( (global float*) & aabb->lower + 0, lower.x );
+ atomic_min( (global float*) & aabb->lower + 1, lower.y );
+ atomic_min( (global float*) & aabb->lower + 2, lower.z );
+ atomic_max( (global float*) & aabb->upper + 0, upper.x );
+ atomic_max( (global float*) & aabb->upper + 1, upper.y );
+ atomic_max( (global float*) & aabb->upper + 2, upper.z );
+}
+
+GRL_INLINE void AABB3f_atomic_merge_local_lu( local struct AABB3f* aabb, const float3 lower, const float3 upper )
+{
+ atomic_min( (local float*) & aabb->lower + 0, lower.x );
+ atomic_min( (local float*) & aabb->lower + 1, lower.y );
+ atomic_min( (local float*) & aabb->lower + 2, lower.z );
+ atomic_max( (local float*) & aabb->upper + 0, upper.x );
+ atomic_max( (local float*) & aabb->upper + 1, upper.y );
+ atomic_max( (local float*) & aabb->upper + 2, upper.z );
+}
+
+GRL_INLINE void Uniform_AABB3f_atomic_merge_local_sub_group_lu(uniform local struct AABB3f* aabb, const float3 lower, const float3 upper)
+{
+ float lx = sub_group_reduce_min(lower.x);
+ float ly = sub_group_reduce_min(lower.y);
+ float lz = sub_group_reduce_min(lower.z);
+
+ float ux = sub_group_reduce_max(upper.x);
+ float uy = sub_group_reduce_max(upper.y);
+ float uz = sub_group_reduce_max(upper.z);
+
+ if (get_sub_group_local_id() == 0)
+ {
+ atomic_min((local float*) & aabb->lower + 0, lx);
+ atomic_min((local float*) & aabb->lower + 1, ly);
+ atomic_min((local float*) & aabb->lower + 2, lz);
+ atomic_max((local float*) & aabb->upper + 0, ux);
+ atomic_max((local float*) & aabb->upper + 1, uy);
+ atomic_max((local float*) & aabb->upper + 2, uz);
+ }
+}
+
+GRL_INLINE void AABB3f_atomic_merge_global_sub_group_lu(uniform global struct AABB3f* aabb, const float3 lower, const float3 upper)
+{
+ uint lane = get_sub_group_local_id();
+ float l[3];
+ l[0] = sub_group_reduce_min(lower.x);
+ l[1] = sub_group_reduce_min(lower.y);
+ l[2] = sub_group_reduce_min(lower.z);
+ float u[3];
+ u[0] = sub_group_reduce_max(upper.x);
+ u[1] = sub_group_reduce_max(upper.y);
+ u[2] = sub_group_reduce_max(upper.z);
+
+ if (lane < 3)
+ {
+ atomic_min((global float*)&aabb->lower + lane, l[lane]);
+ atomic_max((global float*)&aabb->upper + lane, u[lane]);
+ }
+}
+
+GRL_INLINE void AABB3f_atomic_merge_global( global struct AABB3f* aabb, struct AABB3f* other )
+{
+ float3 lower = AABB3f_load_lower( other );
+ float3 upper = AABB3f_load_upper( other );
+ atomic_min( (global float*) & aabb->lower + 0, lower.x );
+ atomic_min( (global float*) & aabb->lower + 1, lower.y );
+ atomic_min( (global float*) & aabb->lower + 2, lower.z );
+ atomic_max( (global float*) & aabb->upper + 0, upper.x );
+ atomic_max( (global float*) & aabb->upper + 1, upper.y );
+ atomic_max( (global float*) & aabb->upper + 2, upper.z );
+}
+
+GRL_INLINE void AABB3f_atomic_merge_localBB_nocheck( local struct AABB3f* aabb, struct AABB3f* bb )
+{
+ atomic_min( (local float*) & aabb->lower + 0, bb->lower[0] );
+ atomic_min( (local float*) & aabb->lower + 1, bb->lower[1] );
+ atomic_min( (local float*) & aabb->lower + 2, bb->lower[2] );
+ atomic_max( (local float*) & aabb->upper + 0, bb->upper[0] );
+ atomic_max( (local float*) & aabb->upper + 1, bb->upper[1] );
+ atomic_max( (local float*) & aabb->upper + 2, bb->upper[2] );
+}
+
+GRL_INLINE void AABB3f_atomic_merge_local(local struct AABB3f *aabb, const float4 lower, const float4 upper)
+{
+ if (lower.x < aabb->lower[0])
+ atomic_min((local float *)&aabb->lower + 0, lower.x);
+ if (lower.y < aabb->lower[1])
+ atomic_min((local float *)&aabb->lower + 1, lower.y);
+ if (lower.z < aabb->lower[2])
+ atomic_min((local float *)&aabb->lower + 2, lower.z);
+ if (upper.x > aabb->upper[0])
+ atomic_max((local float *)&aabb->upper + 0, upper.x);
+ if (upper.y > aabb->upper[1])
+ atomic_max((local float *)&aabb->upper + 1, upper.y);
+ if (upper.z > aabb->upper[2])
+ atomic_max((local float *)&aabb->upper + 2, upper.z);
+}
+
+GRL_INLINE void AABB3f_atomic_merge_global_local(global struct AABB3f *dest, local struct AABB3f *source)
+{
+ float3 l = AABB3f_load_lower(source);
+ float3 u = AABB3f_load_upper(source);
+ atomic_min((global float *)&dest->lower + 0, l.x );
+ atomic_min((global float *)&dest->lower + 1, l.y );
+ atomic_min((global float *)&dest->lower + 2, l.z );
+ atomic_max((global float *)&dest->upper + 0, u.x );
+ atomic_max((global float *)&dest->upper + 1, u.y );
+ atomic_max((global float *)&dest->upper + 2, u.z );
+}
+
+
+struct AABB3f AABB3f_construct( float3 min, float3 max )
+{
+ struct AABB3f bb;
+ bb.lower[0] = min.x; bb.lower[1] = min.y; bb.lower[2] = min.z;
+ bb.upper[0] = max.x; bb.upper[1] = max.y; bb.upper[2] = max.z;
+ return bb;
+}
+
+struct AABB3f AABB3f_select( struct AABB3f left, struct AABB3f right, int3 cond )
+{
+ float3 l = select( AABB3f_load_lower(&left), AABB3f_load_lower(&right), cond );
+ float3 u = select( AABB3f_load_upper(&left), AABB3f_load_upper(&right), cond );
+ return AABB3f_construct( l, u );
+}
+
+#endif
+
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
+
diff --git a/src/intel/vulkan/grl/include/GRLGen12.h b/src/intel/vulkan/grl/include/GRLGen12.h
new file mode 100644
index 00000000000..20849599e91
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLGen12.h
@@ -0,0 +1,691 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file is to contain structure definitions related to the Gen12 QBVH6 acceleration structures
+//
+//
+
+//********************************************************************************************
+// WARNING!!!!!
+// This file is shared by OpenCL and C++ source code and must be compatible.
+// There should only be C structure definitions and trivial GRL_INLINE functions here
+//
+//********************************************************************************************
+
+#pragma once
+
+#include "GRLRTASCommon.h"
+#include "GRLUtilities.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+GRL_NAMESPACE_BEGIN(GEN12)
+
+ enum_uint8(NodeType)
+ {
+ NODE_TYPE_MIXED = 0x0, // identifies a mixed internal node where each child can have a different type
+ NODE_TYPE_INTERNAL = 0x0, // internal BVH node with 6 children
+ NODE_TYPE_INSTANCE = 0x1, // instance leaf
+ NODE_TYPE_PROCEDURAL = 0x3, // procedural leaf
+ NODE_TYPE_QUAD = 0x4, // quad leaf
+ NODE_TYPE_INVALID = 0x7 // indicates invalid node
+ };
+
+
+ typedef enum PrimLeafType
+ {
+ TYPE_NONE = 0,
+
+ TYPE_QUAD = 0,
+
+ /* For a node type of NODE_TYPE_PROCEDURAL we support enabling
+ * and disabling the opaque/non_opaque culling. */
+
+ TYPE_OPACITY_CULLING_ENABLED = 0,
+ TYPE_OPACITY_CULLING_DISABLED = 1
+ } PrimLeafType;
+
+ #define BVH_MAGIC_MACRO "GEN12_RTAS_005" // If serialization-breaking or algorithm-breaking changes are made, increment the digits at the end
+ static const char BVH_MAGIC[16] = BVH_MAGIC_MACRO;
+
+ typedef struct BVHBase
+ {
+ // TODO: Implement the "copy-first-node" trick... duplicate root node here
+
+ uint64_t rootNodeOffset;
+
+ uint32_t reserved;
+
+ uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
+ uint32_t quadLeafStart;
+ uint32_t quadLeafCur;
+ uint32_t proceduralDataStart;
+ uint32_t proceduralDataCur;
+ uint32_t instanceLeafStart;
+ uint32_t instanceLeafEnd;
+ uint32_t backPointerDataStart; //
+ uint32_t refitTreeletsDataStart; // refit structs
+ uint32_t refitStartPointDataStart; //
+ uint32_t BVHDataEnd;
+
+ // number of bottom treelets
+ // if 1, then the bottom treelet is also tip treelet
+ uint32_t refitTreeletCnt;
+ uint32_t refitTreeletCnt2; // always 0, used for atomic updates
+ // data layout:
+ // @backPointerDataStart
+ // 'backpointer' - a dword per inner node.
+ // The bits are used as follows:
+ // 2:0 --> Used as a refit counter during BVH refitting. MBZ
+ // 5:3 --> Number of children
+ // 31:6 --> Index of the parent node in the internal node array
+ // The root node has a parent index of all ones
+ // @refitTreeletsDataStart
+ // RefitTreelet[], the last treelet is for top treelet all previous are for bottom
+ // @refitStartPointDataStart
+ // for each treelet T, its startpoints occupy the interval [T.startpoint_offset, T.startpoint_offset + T.numStartpoints) in this space
+ // @backPointerDataEnd
+
+ uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
+ uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
+ uint32_t fatLeafTableStart;
+ uint32_t innerTableStart;
+
+ uint32_t quadLeftoversCountNewAtomicUpdate; // number of quad leftovers for new atomic update
+ uint32_t quadTableSizeNewAtomicUpdate; // size of quad Table including leftovers, padded to 256
+ uint32_t quadIndicesDataStart;
+
+ uint32_t _pad[9];
+
+ struct RTASMetaData Meta;
+
+ } BVHBase;
+
+ GRL_INLINE struct GeoMetaData* BVHBase_GetGeoMetaData(BVHBase* base)
+ {
+ return (struct GeoMetaData*)(((char*)base) + base->Meta.geoDescsStart);
+ }
+
+#ifdef __OPENCL_VERSION__
+#define BVH_ROOT_NODE_OFFSET sizeof(BVHBase)
+#else
+#define BVH_ROOT_NODE_OFFSET sizeof(GRL::RTAS::GEN12::BVHBase)
+#endif
+
+GRL_STATIC_ASSERT( sizeof(BVHBase) == BVH_ROOT_NODE_OFFSET, "Wrong size!");
+GRL_STATIC_ASSERT( (sizeof(BVHBase) % 64) == 0 , "Misaligned size!");
+
+ typedef struct BackPointers {
+ } BackPointers;
+
+ // threshold for the size of bottom treelets: no bottom treelet has more startpoints (paths) than this number
+ // note: in practice treelets are usually 2-3x smaller than this
+ #define TREELET_NUM_STARTPOINTS 1536
+
+ // threshold under which only one treelet will be created
+ #define SINGLE_TREELET_THRESHOLD 3072
+
+ typedef struct LeafTableEntry {
+
+ uint backpointer;
+ uint inner_node_index;
+ uint leaf_index;
+ } LeafTableEntry;
+
+ typedef struct InnerNodeTableEntry {
+
+ uint node_index_and_numchildren; // numchildren in 3 lsbs
+ uint first_child;
+
+ } InnerNodeTableEntry;
+
+ typedef struct QuadDataIndices
+ {
+ uint header_data[4];
+ uint vert_idx[4];
+ } QuadDataIndices;
+
+ typedef struct RefitTreelet {
+ uint32_t startpoint_offset;
+ uint32_t numStartpoints;
+ uint32_t numNonTrivialStartpoints;
+ uint8_t maxDepth;
+ uint8_t depthLess64; // depth from the bottom at which there are fewer than 64 paths
+ uint8_t depthLess128; // depth from the bottom at which there are fewer than 128 paths
+ uint8_t depthLess256; // depth from the bottom at which there are fewer than 256 paths
+ } RefitTreelet;
+
+ // if RefitTreelet has number of startpoints == 1
+ // it should be reinterpreted as:
+ typedef struct RefitTreeletTrivial {
+ uint32_t theOnlyNodeIndex;
+ uint32_t numStartpoints; // must be 1 or 0
+ int32_t childrenOffsetOfTheNode; // offset of the node's children, relative to node 0
+ uint8_t maxDepth;
+ uint8_t numChildrenOfTheNode;
+ } RefitTreeletTrivial;
+
+ // 5:0 - depth after you die
+ // 31:6 - Index of the inner node
+ typedef uint32_t StartPoint;
+
+ struct HwInstanceLeaf;
+ struct QuadLeaf;
+ struct ProceduralLeaf;
+ struct InternalNode;
+
+ typedef struct HwInstanceLeaf HwInstanceLeaf;
+ typedef struct InternalNode InternalNode;
+ typedef struct QuadLeaf QuadLeaf;
+ typedef struct ProceduralLeaf ProceduralLeaf;
+
+ GRL_INLINE uint32_t BackPointer_GetParentIndex( uint32_t bp )
+ {
+ return bp >> 6;
+ }
+ GRL_INLINE uint32_t BackPointer_GetNumChildren( uint32_t bp )
+ {
+ return (bp >> 3) & (7);
+ }
+ GRL_INLINE uint32_t BackPointer_GetRefitCount( uint32_t bp )
+ {
+ return bp & 7;
+ }
+ GRL_INLINE bool BackPointer_IsRoot( uint32_t bp )
+ {
+ return (bp >> 6) == 0x03FFFFFF;
+ }
+
+ GRL_INLINE InternalNode* BVHBase_GetRootNode( const BVHBase* p )
+ {
+ return (InternalNode*)( ((char*)p) + BVH_ROOT_NODE_OFFSET);
+ }
+
+ GRL_INLINE AABB3f BVHBase_GetRootAABB(const BVHBase* p)
+ {
+ return p->Meta.bounds;
+ }
+
+ GRL_INLINE InternalNode* BVHBase_GetInternalNodes(const BVHBase* p)
+ {
+ return (InternalNode*)(((char*)p) + BVH_ROOT_NODE_OFFSET);
+ }
+ GRL_INLINE InternalNode* BVHBase_GetInternalNodesEnd(const BVHBase* p)
+ {
+ return (InternalNode*)(((char*)p) + (size_t)(64u * p->nodeDataCur));
+ }
+ GRL_INLINE uint32_t BVHBase_GetNumInternalNodes(const BVHBase* p)
+ {
+ return p->nodeDataCur - BVH_ROOT_NODE_OFFSET / 64;
+ }
+
+
+ GRL_INLINE QuadLeaf* BVHBase_GetQuadLeaves(const BVHBase* p)
+ {
+ return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafStart));
+ }
+ GRL_INLINE const QuadLeaf* BVHBase_GetQuadLeaves_End(const BVHBase* p)
+ {
+ return (QuadLeaf*)(((char*)p) + (size_t)(64u * p->quadLeafCur));
+ }
+
+ GRL_INLINE const ProceduralLeaf* BVHBase_GetProceduralLeaves_End(const BVHBase* p)
+ {
+ return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataCur));
+ }
+
+ GRL_INLINE ProceduralLeaf* BVHBase_GetProceduralLeaves(const BVHBase* p)
+ {
+ return (ProceduralLeaf*)(((char*)p) + (size_t)(64u * p->proceduralDataStart));
+ }
+
+ GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves(const BVHBase* p )
+ {
+ char* pRTASBits = (char*)p;
+ return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafStart));
+ }
+
+ GRL_INLINE HwInstanceLeaf* BVHBase_GetHWInstanceLeaves_End(const BVHBase* p )
+ {
+ char* pRTASBits = (char*) p;
+ return (HwInstanceLeaf*)(pRTASBits + (size_t)(64u * p->instanceLeafEnd));
+ }
+
+ GRL_INLINE uint BVHBase_GetNumHWInstanceLeaves( const BVHBase* p )
+ {
+ return (p->instanceLeafEnd - p->instanceLeafStart) / 2;
+ }
+
+ GRL_INLINE uint* BVHBase_GetRefitStartPoints(const BVHBase* p)
+ {
+ return (uint32_t*)(((char*)p) + (size_t)(64u * p->refitStartPointDataStart));
+ }
+
+ GRL_INLINE uint BVHBase_GetRefitStartPointsSize(const BVHBase* p)
+ {
+ return 64u * (p->fatLeafTableStart - p->refitStartPointDataStart);
+ }
+
+ GRL_INLINE uint StartPoint_GetDepth(StartPoint s)
+ {
+ return s & ((1 << 6) - 1);
+ }
+
+ GRL_INLINE uint StartPoint_GetNodeIdx(StartPoint s)
+ {
+ return s >> 6;
+ }
+
+ GRL_INLINE RefitTreelet* BVHBase_GetRefitTreeletDescs(const BVHBase* p)
+ {
+ return (RefitTreelet*)(((char*)p) + (size_t)(64u * p->refitTreeletsDataStart));
+ }
+
+ // This is the treelet count as it should be executed, i.e. the number of bottom treelets when both top and bottom treelets exist.
+ // To get the real number of all treelets, including the tip treelet, the formula is:
+ // actualNumTreelets = refitTreeletCnt > 1 ? refitTreeletCnt + 1 : 1;
+ GRL_INLINE uint32_t* BVHBase_GetRefitTreeletCntPtr(BVHBase* p)
+ {
+ return &p->refitTreeletCnt;
+ }
+
+ GRL_INLINE uint32_t BVHBase_GetRefitTreeletCnt(const BVHBase* p)
+ {
+ return p->refitTreeletCnt;
+ }
+
+ GRL_INLINE uint32_t BVHBase_IsSingleTreelet(const BVHBase* p)
+ {
+ return p->refitTreeletCnt == 1;
+ }
+
+ GRL_INLINE BackPointers* BVHBase_GetBackPointers(const BVHBase* p)
+ {
+ return (BackPointers*)(((char*)p) + (size_t)(64u * p->backPointerDataStart));
+ }
+
+
+ GRL_INLINE LeafTableEntry* BVHBase_GetFatLeafTable(const BVHBase* p)
+ {
+ return (LeafTableEntry*)(((char*)p) + (size_t)(64u * p->fatLeafTableStart));
+ }
+ GRL_INLINE InnerNodeTableEntry* BVHBase_GetInnerNodeTable(const BVHBase* p)
+ {
+ return (InnerNodeTableEntry*)(((char*)p) + (size_t)(64u * p->innerTableStart));
+ }
+ GRL_INLINE QuadDataIndices* BVHBase_GetQuadDataIndicesTable(const BVHBase* p)
+ {
+ return (QuadDataIndices*)(((char*)p) + (size_t)(64u * p->quadIndicesDataStart));
+ }
+
+ GRL_INLINE unsigned* InnerNode_GetBackPointer(
+ BackPointers* backpointersStruct,
+ uint32_t inodeOffset /*in 64B units, from the earliest Inner node*/)
+ {
+ uint* backpointersArray = (uint*)backpointersStruct;
+ // BACKPOINTER_LAYOUT
+ uint new_index = inodeOffset; //<-layout canonical
+ //uint new_index = inodeOffset*16; //<-layout scattered
+ // uint new_index = (inodeOffset & (~0xFFFF)) | (((inodeOffset & 0xFF) << 8) | ((inodeOffset & 0xFF00) >> 8)); //<-layout hashed
+
+ return backpointersArray + new_index;
+ }
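+
+ // Editor's illustrative sketch (not part of the original header): walking from an inner node
+ // up to the root through the backpointer table. 'nodeIndex' is an index into the internal
+ // node array (64B units from the first inner node), matching BackPointer_GetParentIndex().
+ GRL_INLINE uint32_t BVHBase_CountAncestors_sketch(BVHBase* bvh, uint32_t nodeIndex)
+ {
+     BackPointers* bp = BVHBase_GetBackPointers(bvh);
+     uint32_t depth = 0;
+     uint32_t current = *InnerNode_GetBackPointer(bp, nodeIndex);
+     while (!BackPointer_IsRoot(current))
+     {
+         nodeIndex = BackPointer_GetParentIndex(current);
+         current = *InnerNode_GetBackPointer(bp, nodeIndex);
+         depth++;
+     }
+     return depth;
+ }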
+
+ GRL_INLINE uint32_t BVHBase_GetRefitStructsDataSize(const BVHBase* p)
+ {
+ return 64u * (p->BVHDataEnd - p->backPointerDataStart);
+ }
+
+ GRL_INLINE uint32_t BVHBase_GetBackpointersDataSize(const BVHBase* p)
+ {
+ return 64u * (p->refitTreeletsDataStart - p->backPointerDataStart);
+ }
+
+ GRL_INLINE uint32_t* BVHBase_GetBVHDataEnd( const BVHBase* p )
+ {
+ return (uint32_t*)(((char*)p) + (size_t)(64u * p->BVHDataEnd));
+ }
+
+ GRL_INLINE bool BVHBase_HasBackPointers( const BVHBase* p )
+ {
+ return p->refitTreeletsDataStart > p->backPointerDataStart;
+ }
+
+ GRL_INLINE const size_t BVHBase_GetNumQuads(const BVHBase* p)
+ {
+ return p->quadLeafCur - p->quadLeafStart;
+ }
+
+ GRL_INLINE const size_t BVHBase_GetNumProcedurals(const BVHBase* p)
+ {
+ return p->proceduralDataCur - p->proceduralDataStart;
+ }
+
+ GRL_INLINE const size_t BVHBase_GetNumInstances(const BVHBase* p)
+ {
+ return (p->instanceLeafEnd - p->instanceLeafStart) / 2;
+ }
+
+ GRL_INLINE const size_t BVHBase_totalBytes(const BVHBase* p)
+ {
+ return p->BVHDataEnd * 64u;
+ }
+
+
+
+ struct HwInstanceLeaf
+ {
+ /* first 64 bytes accessed during traversal */
+ struct Part0
+ {
+ //uint32_t shaderIndex : 24;
+ //uint32_t geomMask : 8;
+ uint32_t DW0;
+
+ // uint32_t instanceContributionToHitGroupIndex : 24;
+ // uint32_t pad0 : 8
+ //
+ // NOTE: Traversal shaders are implemented by aliasing instance leaves as procedural and sending them through the procedural path
+ // For a procedural instance, bit 29 should be set to 1, to disable "opaque culling"
+ // and bits 30 and 31 must be zero. See also the definition of the 'PrimLeafDesc' structure
+ uint32_t DW1;
+
+ // uint64_t rootNodePtr : 48;
+ // uint64_t instFlags : 8;
+ // uint64_t pad1 : 8;
+ uint64_t DW2_DW3;
+
+ // Vec3f world2obj_vx; // 1st row of World2Obj transform
+ float world2obj_vx_x;
+ float world2obj_vx_y;
+ float world2obj_vx_z;
+
+ // Vec3f world2obj_vy; // 2nd row of World2Obj transform
+ float world2obj_vy_x;
+ float world2obj_vy_y;
+ float world2obj_vy_z;
+
+ // Vec3f world2obj_vz; // 3rd row of World2Obj transform
+ float world2obj_vz_x;
+ float world2obj_vz_y;
+ float world2obj_vz_z;
+
+ // Vec3f obj2world_p; // translation of Obj2World transform (intentionally kept in the first 64 bytes)
+ float obj2world_p_x;
+ float obj2world_p_y;
+ float obj2world_p_z;
+ } part0;
+
+ /* second 64 bytes accessed during shading */
+ // NOTE: Everything in this block is under SW control
+ struct Part1
+ {
+ // uint64_t bvhPtr : 48;
+ // uint64_t pad : 16;
+ uint64_t DW0_DW1;
+
+ uint32_t instanceID;
+ uint32_t instanceIndex;
+
+ // Vec3f obj2world_vx; // 1st row of Obj2World transform
+ float obj2world_vx_x;
+ float obj2world_vx_y;
+ float obj2world_vx_z;
+
+ // Vec3f obj2world_vy; // 2nd row of Obj2World transform
+ float obj2world_vy_x;
+ float obj2world_vy_y;
+ float obj2world_vy_z;
+
+ // Vec3f obj2world_vz; // 3rd row of Obj2World transform
+ float obj2world_vz_x;
+ float obj2world_vz_y;
+ float obj2world_vz_z;
+
+ // Vec3f world2obj_p; // translation of World2Obj transform
+ float world2obj_p_x;
+ float world2obj_p_y;
+ float world2obj_p_z;
+ } part1;
+ };
+
+ __constant const uint64_t c_one = 1ul;
+
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceMask( const HwInstanceLeaf* p )
+ {
+ return p->part0.DW0 >> 24;
+ }
+
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceContributionToHitGroupIndex( const HwInstanceLeaf* p )
+ {
+ return p->part0.DW1 & 0x00ffffff;
+ }
+
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceFlags( const HwInstanceLeaf* p )
+ {
+ return (p->part0.DW2_DW3 >> 48) & 0xff;
+ }
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceID( const HwInstanceLeaf* p )
+ {
+ return p->part1.instanceID;
+ }
+
+ GRL_INLINE gpuva_t HwInstanceLeaf_GetBVH( const HwInstanceLeaf* p ) { return p->part1.DW0_DW1 & ((c_one << 48) - 1); }
+ GRL_INLINE gpuva_t HwInstanceLeaf_GetStartNode( const HwInstanceLeaf* p ) { return p->part0.DW2_DW3 & ((c_one << 48) - 1); }
+ GRL_INLINE uint32_t HwInstanceLeaf_GetInstanceIndex( const HwInstanceLeaf* p ) { return p->part1.instanceIndex; }
+
+ GRL_INLINE void HwInstanceLeaf_GetTransform(struct HwInstanceLeaf* p, float* transform)
+ {
+ transform[0] = p->part1.obj2world_vx_x;
+ transform[1] = p->part1.obj2world_vy_x;
+ transform[2] = p->part1.obj2world_vz_x;
+ transform[3] = p->part0.obj2world_p_x;
+ transform[4] = p->part1.obj2world_vx_y;
+ transform[5] = p->part1.obj2world_vy_y;
+ transform[6] = p->part1.obj2world_vz_y;
+ transform[7] = p->part0.obj2world_p_y;
+ transform[8] = p->part1.obj2world_vx_z;
+ transform[9] = p->part1.obj2world_vy_z;
+ transform[10] = p->part1.obj2world_vz_z;
+ transform[11] = p->part0.obj2world_p_z;
+ }
+
+ GRL_INLINE void HwInstanceLeaf_SetBVH( HwInstanceLeaf* p, gpuva_t b ) {
+ uint64_t mask = ((c_one << 48) - 1);
+ uint64_t v = p->part1.DW0_DW1;
+ v = (b & mask) | (v & ~mask);
+ p->part1.DW0_DW1 = v;
+ }
+ GRL_INLINE void HwInstanceLeaf_SetStartNode( HwInstanceLeaf* p, gpuva_t b ) {
+ uint64_t mask = ((c_one << 48) - 1);
+ uint64_t v = p->part0.DW2_DW3;
+ v = (b & mask) | (v & ~mask);
+ p->part0.DW2_DW3 = v;
+ }
+ GRL_INLINE void HwInstanceLeaf_SetStartNodeAndInstanceFlags( HwInstanceLeaf* p,
+ gpuva_t root,
+ uint8_t flags ) {
+ uint64_t mask = ((1ull << 48) - 1);
+ uint64_t v = (root & mask) | ((uint64_t)(flags)<<48);
+ p->part1.DW0_DW1 = v;
+ }
+
+ struct InternalNode
+ {
+ float lower[3]; // world space origin of quantization grid
+ int32_t childOffset; // offset to all children in 64B multiples
+
+ uint8_t nodeType; // the type of the node
+ uint8_t pad; // unused byte
+
+ int8_t exp_x; // 2^exp_x is the size of the grid in x dimension
+ int8_t exp_y; // 2^exp_y is the size of the grid in y dimension
+ int8_t exp_z; // 2^exp_z is the size of the grid in z dimension
+ uint8_t nodeMask; // mask used for ray filtering
+
+ struct ChildData
+ {
+ //uint8_t blockIncr : 2; // size of child in 64 byte blocks. Must be ==2 for instance leaves, <=2 for quad leaves.
+ //uint8_t startPrim : 4; // start primitive in fat leaf mode or child type in mixed mode
+ //uint8_t pad : 2; // unused bits
+ uint8_t bits;
+ } childData[6];
+
+ uint8_t lower_x[6]; // the quantized lower bounds in x-dimension
+ uint8_t upper_x[6]; // the quantized upper bounds in x-dimension
+ uint8_t lower_y[6]; // the quantized lower bounds in y-dimension
+ uint8_t upper_y[6]; // the quantized upper bounds in y-dimension
+ uint8_t lower_z[6]; // the quantized lower bounds in z-dimension
+ uint8_t upper_z[6]; // the quantized upper bounds in z-dimension
+ };
+
+ GRL_INLINE uint InternalNode_GetChildBlockIncr( const InternalNode* p, uint idx )
+ {
+ return p->childData[idx].bits & 3;
+ }
+ GRL_INLINE uint InternalNode_GetChildStartPrim( const InternalNode* p, uint idx )
+ {
+ return (p->childData[idx].bits>>2) & 0xf;
+ }
+
+ GRL_INLINE uint8_t InternalNode_GetChildType( const InternalNode* p, uint idx )
+ {
+ return (p->childData[idx].bits >> 2) & 0xF;
+ }
+
+ GRL_INLINE void InternalNode_SetChildType( InternalNode* p, uint idx, uint type )
+ {
+ uint bits = p->childData[idx].bits;
+ const uint mask = (0xF << 2);
+ bits = ((type << 2) & mask) | (bits & ~mask);
+ p->childData[idx].bits = (uint8_t)bits;
+ }
+
+ GRL_INLINE bool InternalNode_IsChildValid( const InternalNode* p, size_t child )
+ {
+ bool lower = p->lower_x[child] & 0x80; // invalid nodes are indicated by setting lower_msb = 1 and upper_msb=0
+ bool upper = p->upper_x[child] & 0x80;
+ return !lower || upper;
+ }
+
+ GRL_INLINE AABB3f InternalNode_GetChildAABB(const InternalNode* node, size_t i)
+ {
+ float4 lower, upper;
+ const float4 base = { node->lower[0], node->lower[1], node->lower[2], 0.0f };
+ const int4 lower_i = { node->lower_x[i], node->lower_y[i], node->lower_z[i], 0 };
+ const int4 upper_i = { node->upper_x[i], node->upper_y[i], node->upper_z[i], 0 };
+ const int4 exp_i = { node->exp_x, node->exp_y, node->exp_z, 0 };
+ lower = base + bitShiftLdexp4(convert_float4_rtn(lower_i), exp_i - 8);
+ upper = base + bitShiftLdexp4(convert_float4_rtp(upper_i), exp_i - 8);
+ AABB3f aabb3f = {
+ { lower.x, lower.y, lower.z },
+ { upper.x, upper.y, upper.z } };
+ return aabb3f;
+ }
+
+ GRL_INLINE void* InternalNode_GetChildren( InternalNode* node)
+ {
+ return (void*)(((char*)node) + node->childOffset * 64);
+ }
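+
+ // Editor's illustrative sketch (not part of the original header): counting the valid children
+ // of an internal node via InternalNode_IsChildValid(); a node has at most 6 children.
+ GRL_INLINE uint32_t InternalNode_NumValidChildren_sketch(const InternalNode* node)
+ {
+     uint32_t n = 0;
+     for (uint32_t i = 0; i < 6; ++i)
+         if (InternalNode_IsChildValid(node, i))
+             n++;
+     return n;
+ }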
+
+ typedef struct PrimLeafDesc
+ {
+ //uint32_t shaderIndex : 24; // shader index used for shader record calculations
+ //uint32_t geomMask : 8; // geometry mask used for ray masking
+ uint32_t shaderIndex_geomMask;
+
+ //uint32_t geomIndex : 29; // the geometry index specifies the n'th geometry of the scene
+ //PrimLeafType type : 1; // see above
+ //GeometryFlags geomFlags : 2; // geometry flags of this geometry
+ uint32_t geomIndex_flags;
+ } PrimLeafDesc;
+
+ GRL_INLINE uint32_t PrimLeaf_GetShaderIndex( const PrimLeafDesc* p )
+ {
+ return p->shaderIndex_geomMask & ((1 << 24) - 1);
+ }
+ GRL_INLINE uint32_t PrimLeaf_GetGeoIndex( const PrimLeafDesc* p )
+ {
+ return p->geomIndex_flags & ((1<<29)-1);
+ }
+ GRL_INLINE uint32_t PrimLeaf_GetGeomFlags( const PrimLeafDesc* p )
+ {
+ return (p->geomIndex_flags >> 30);
+ }
+ GRL_INLINE uint32_t PrimLeaf_GetType(const PrimLeafDesc* p)
+ {
+ return (p->geomIndex_flags >> 29) & 1;
+ }
+
+ struct QuadLeaf
+ {
+ PrimLeafDesc leafDesc;
+
+ uint32_t primIndex0;
+
+ //uint32_t primIndex1Delta : 16;
+ //uint32_t j0 : 2;
+ //uint32_t j1 : 2;
+ //uint32_t j2 : 2;
+ //uint32_t last : 1; // last quad in list
+ //uint32_t pad : 9;
+ uint32_t DW1;
+
+ float v[4][3];
+ };
+
+ GRL_INLINE uint32_t QuadLeaf_GetPrimIndexDelta( const QuadLeaf* p )
+ {
+ return p->DW1 & 0x0000ffff;
+ }
+ GRL_INLINE uint32_t QuadLeaf_GetPrimIndex0( const QuadLeaf* p )
+ {
+ return p->primIndex0;
+ }
+ GRL_INLINE uint32_t QuadLeaf_GetPrimIndex1( const QuadLeaf* p )
+ {
+ return p->primIndex0 + QuadLeaf_GetPrimIndexDelta(p);
+ }
+ GRL_INLINE bool QuadLeaf_IsSingleTriangle( const QuadLeaf* p )
+ {
+ return QuadLeaf_GetPrimIndexDelta(p) == 0;
+ }
+ GRL_INLINE uint32_t QuadLeaf_GetSecondTriangleIndices( const QuadLeaf* p )
+ {
+ return (p->DW1>>16) & 0x3f;
+ }
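+
+ // Editor's illustrative sketch (not part of the original header): counting triangles across
+ // all quad leaves of a BVH; each 64-byte QuadLeaf holds either one or two triangles.
+ GRL_INLINE uint32_t BVHBase_CountTriangles_sketch(const BVHBase* bvh)
+ {
+     const QuadLeaf* quads = BVHBase_GetQuadLeaves(bvh);
+     const uint32_t numQuads = (uint32_t)BVHBase_GetNumQuads(bvh);
+     uint32_t triangles = 0;
+     for (uint32_t i = 0; i < numQuads; ++i)
+         triangles += QuadLeaf_IsSingleTriangle(&quads[i]) ? 1 : 2;
+     return triangles;
+ }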
+
+ GRL_INLINE void QuadLeaf_SetVertices( QuadLeaf* quad, float3 v0, float3 v1, float3 v2, float3 v3 )
+ {
+ quad->v[0][0] = v0.x;
+ quad->v[0][1] = v0.y;
+ quad->v[0][2] = v0.z;
+ quad->v[1][0] = v1.x;
+ quad->v[1][1] = v1.y;
+ quad->v[1][2] = v1.z;
+ quad->v[2][0] = v2.x;
+ quad->v[2][1] = v2.y;
+ quad->v[2][2] = v2.z;
+ quad->v[3][0] = v3.x;
+ quad->v[3][1] = v3.y;
+ quad->v[3][2] = v3.z;
+ }
+
+
+ struct ProceduralLeaf {
+ PrimLeafDesc leafDesc;
+
+ // Number of primitives + "last" bits.
+ // The meaning of this section is SW-defined and flexible
+ uint32_t DW1;
+ uint32_t _primIndex[13];
+ };
+
+GRL_NAMESPACE_END(GEN12)
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/include/GRLIntTypes.h b/src/intel/vulkan/grl/include/GRLIntTypes.h
new file mode 100644
index 00000000000..573dbbc7481
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLIntTypes.h
@@ -0,0 +1,152 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//********************************************************************************************
+// WARNING!!!!!
+//
+// This file is shared by OpenCL and C++ source code and must be a pure C header
+// There should only be C structure definitions and trivial inline functions here
+//
+//********************************************************************************************
+
+#pragma once
+
+#include "GRLOCLCompatibility.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+
+ typedef uint32_t dword;
+ typedef uint64_t qword;
+ typedef qword gpuva_t;
+
+
+ enum_uint8( InstanceFlags )
+ {
+ INSTANCE_FLAG_TRIANGLE_CULL_DISABLE = 0x1,
+ INSTANCE_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = 0x2,
+ INSTANCE_FLAG_FORCE_OPAQUE = 0x4,
+ INSTANCE_FLAG_FORCE_NON_OPAQUE = 0x8,
+ };
+
+ enum_uint8( GeometryFlags )
+ {
+ GEOMETRY_FLAG_NONE = 0x0,
+ GEOMETRY_FLAG_OPAQUE = 0x1,
+ GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = 0x2,
+ };
+
+ enum_uint8( GeometryType )
+ {
+ GEOMETRY_TYPE_TRIANGLES = 0,
+ GEOMETRY_TYPE_PROCEDURAL = 1,
+ NUM_GEOMETRY_TYPES = 2
+ };
+
+ // NOTE: Does NOT match DXR
+ enum_uint8( IndexFormat )
+ {
+ INDEX_FORMAT_NONE = 0, // indicates non-indexed geometry
+ INDEX_FORMAT_R16_UINT = 2,
+ INDEX_FORMAT_R32_UINT = 4,
+ INDEX_FORMAT_END = INDEX_FORMAT_R32_UINT + 1
+ };
+
+ // NOTE: Does NOT match DXR
+ enum_uint8( VertexFormat )
+ {
+ VERTEX_FORMAT_R32G32_FLOAT = 0,
+ VERTEX_FORMAT_R32G32B32_FLOAT = 1,
+ VERTEX_FORMAT_R16G16_FLOAT = 2,
+ VERTEX_FORMAT_R16G16B16A16_FLOAT = 3,
+ VERTEX_FORMAT_R16G16_SNORM = 4,
+ VERTEX_FORMAT_R16G16B16A16_SNORM = 5,
+ VERTEX_FORMAT_R16G16B16A16_UNORM = 6,
+ VERTEX_FORMAT_R16G16_UNORM = 7,
+ VERTEX_FORMAT_R10G10B10A2_UNORM = 8,
+ VERTEX_FORMAT_R8G8B8A8_UNORM = 9,
+ VERTEX_FORMAT_R8G8_UNORM = 10,
+ VERTEX_FORMAT_R8G8B8A8_SNORM = 11,
+ VERTEX_FORMAT_R8G8_SNORM = 12,
+ VERTEX_FORMAT_END = VERTEX_FORMAT_R8G8_SNORM + 1
+ };
+
+
+
+ enum_uint32(RTASFlags)
+ {
+ // These flags match DXR
+ BUILD_FLAG_ALLOW_UPDATE = 1<<0,
+ BUILD_FLAG_ALLOW_COMPACTION = 1<<1,
+ BUILD_FLAG_PREFER_FAST_TRACE = 1<<2,
+ BUILD_FLAG_PREFER_FAST_BUILD = 1<<3,
+ BUILD_FLAG_MINIMIZE_MEMORY = 1<<4,
+ BUILD_FLAG_PERFORM_UPDATE = 1<<5,
+
+ // internal flags start here
+ BUILD_FLAG_DISALLOW_REBRAID = 1<<16,
+
+ BUILD_FLAG_ALL = 0x0001003f
+ };
+
+ enum_uint8(BVHType)
+ {
+ BVH_TYPE_NONE, // This is a sentinel for drivers to use when compiling out GRL on non-RT devices
+ BVH_TYPE_GEN12,
+ };
+
+ enum_uint8(PostBuildInfoType)
+ {
+ PBI_CURRENT_SIZE,
+ PBI_COMPACTED_SIZE,
+ PBI_DXR_TOOLS_VISUALIZATION_DESC,
+ PBI_DXR_SERIALIZATION_DESC,
+ };
+
+ enum_uint32(HazardTypes)
+ {
+ HAZARD_RTAS_READ = 1 << 0,
+ HAZARD_RTAS_WRITE = 1 << 1,
+ HAZARD_READ = 1 << 2,
+ HAZARD_WRITE = 1 << 3,
+ HAZARD_ALL = 0xf
+ };
+
+ enum_uint32(RaytracingAccelerationStructureType)
+ {
+ TOP_LEVEL = 0x0,
+ BOTTOM_LEVEL = 0x1,
+ };
+
+ typedef struct PostbuildInfoCurrentSize
+ {
+ uint64_t CurrentSizeInBytes;
+ } PostbuildInfoCurrentSize;
+
+ typedef struct PostbuildInfoCompactedSize
+ {
+ uint64_t CompactedSizeInBytes;
+ } PostbuildInfoCompactedSize;
+
+ typedef struct PostbuildInfoToolsVisualizationDesc
+ {
+ uint64_t DecodedSizeInBytes;
+ } PostbuildInfoToolsVisualizationDesc;
+
+ typedef struct PostbuildInfoSerializationDesc
+ {
+ uint64_t SerializedSizeInBytes;
+ uint64_t NumBottomLevelAccelerationStructurePointers;
+ } PostbuildInfoSerializationDesc;
+
+ typedef struct DecodeHeader
+ {
+ RaytracingAccelerationStructureType Type;
+ uint32_t NumDesc;
+ } DecodeHeader;
+
+
+GRL_NAMESPACE_END(GRL) \ No newline at end of file
diff --git a/src/intel/vulkan/grl/include/GRLOCLCompatibility.h b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h
new file mode 100644
index 00000000000..119104f1532
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLOCLCompatibility.h
@@ -0,0 +1,210 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#ifdef __OPENCL_VERSION__
+
+typedef uchar uint8_t;
+typedef ushort uint16_t;
+typedef uint uint32_t;
+typedef ulong uint64_t;
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long int64_t;
+
+#else
+
+#include <stdint.h>
+
+typedef uint8_t uchar;
+typedef uint16_t ushort;
+typedef uint32_t uint;
+typedef uint64_t ulong;
+
+#define __constant
+#define __global
+
+typedef struct uint2
+{
+#ifdef __cplusplus
+ uint2() {};
+ uint2( uint ix, uint iy ) : x( ix ), y( iy ) {};
+#endif
+ uint x;
+ uint y;
+} uint2;
+
+typedef struct uint3
+{
+#ifdef __cplusplus
+ uint3() {};
+ uint3( uint ix, uint iy, uint iz ) : x( ix ), y( iy ), z( iz ) {};
+#endif
+ uint x;
+ uint y;
+ uint z;
+} uint3;
+
+typedef struct int3
+{
+ int32_t x;
+ int32_t y;
+ int32_t z;
+
+#ifdef __cplusplus
+ int3() {};
+ int3(int32_t ix, int32_t iy, int32_t iz) : x(ix), y(iy), z(iz) {};
+
+ int3 operator+(const int32_t i) const { return int3(this->x + i, this->y + i, this->z + i); }
+ int3 operator<<(const int32_t i) const { return int3(this->x << i, this->y << i, this->z << i); }
+#endif
+} int3;
+
+typedef struct int4
+{
+ int32_t x;
+ int32_t y;
+ int32_t z;
+ int32_t w;
+
+#ifdef __cplusplus
+ int4() {};
+ int4(int32_t ix, int32_t iy, int32_t iz, int32_t iw) : x(ix), y(iy), z(iz), w(iw) {};
+
+ int4 operator+(const int32_t i) const { return int4(this->x + i, this->y + i, this->z + i, this->w + i); }
+ int4 operator-(const int32_t i) const { return int4(this->x - i, this->y - i, this->z - i, this->w - i); }
+ int4 operator<<(const int32_t i) const { return int4(this->x << i, this->y << i, this->z << i, this->w << i); }
+#endif
+} int4;
+
+typedef struct float3
+{
+ float x;
+ float y;
+ float z;
+
+#ifdef __cplusplus
+ float3(){};
+ float3( float ix, float iy, float iz ) : x(ix), y(iy), z(iz){};
+
+ float3 operator+( const float3& f3 ) { return float3( this->x + f3.x, this->y + f3.y, this->z + f3.z ); }
+ float3 operator*( const float& f ) { return float3( this->x * f, this->y * f, this->z * f ); }
+ float3 operator*( const float3& f3 ) const { return float3(this->x * f3.x, this->y * f3.y, this->z * f3.z); }
+ float3 operator-() { return float3(-this->x, -this->y, -this->z); }
+ float3 operator-( const float3& f3) { return float3(this->x - f3.x, this->y - f3.y, this->z - f3.z); }
+#endif
+} float3;
+
+typedef struct float4
+{
+ float x;
+ float y;
+ float z;
+ float w;
+
+#ifdef __cplusplus
+ float4() {};
+ float4( float ix, float iy, float iz, float iw ) : x( ix ), y( iy ), z( iz ), w( iw ) {};
+
+ float4 operator+(const float4& f4) const { return float4(this->x + f4.x, this->y + f4.y, this->z + f4.z, this->w + f4.w); }
+ float4 operator*(const float4& f4) const { return float4(this->x * f4.x, this->y * f4.y, this->z * f4.z, this->w * f4.w); }
+#endif
+} float4;
+
+#endif /* ! __OPENCL_VERSION__ */
+
+
+#ifndef __cplusplus
+
+#define GRL_NAMESPACE_BEGIN(x)
+#define GRL_NAMESPACE_END(x)
+#define GRL_OVERLOADABLE __attribute((overloadable))
+#define GRL_INLINE __attribute__((always_inline)) inline static
+
+# define enum_uint8(name) \
+ typedef uint8_t name; \
+ enum name##_uint32
+# define enum_uint16(name) \
+ typedef uint16_t name; \
+ enum name##_uint32
+# define enum_uint32(name) \
+ typedef uint32_t name; \
+ enum name##_uint32
+
+#define OCL_BYTE_ALIGN(n) __attribute__ ((aligned (n)))
+#define GRL_STATIC_ASSERT(condition,desc)
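+
+// For example, 'enum_uint8(NodeType) { ... };' declares the enumerator constants as a regular
+// C enum while the NodeType typedef itself stays a 1-byte integer, so structure layouts stay
+// identical between the OpenCL C and C++ sides of these headers.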
+
+#else /* C++ */
+#ifdef __OPENCL_VERSION__
+#error "OpenCL C++ not supported by this header"
+#endif
+
+#define GRL_NAMESPACE_BEGIN(x) namespace x {
+#define GRL_NAMESPACE_END(x) }
+#define GRL_OVERLOADABLE
+#define GRL_INLINE inline
+
+#define enum_uint8(N) enum N : uint8_t
+#define enum_uint16(N) enum N : uint16_t
+#define enum_uint32(N) enum N : uint32_t
+
+#define OCL_BYTE_ALIGN(n)
+#define GRL_STATIC_ASSERT(condition,desc) static_assert( condition, desc )
+
+#include <cmath>
+
+inline float3 fmin(float3 a, float3 b)
+{
+ float3 o = { std::fmin(a.x, b.x), std::fmin(a.y, b.y), std::fmin(a.z, b.z) };
+ return o;
+}
+
+inline float3 fmax(float3 a, float3 b)
+{
+ float3 o = { std::fmax(a.x, b.x), std::fmax(a.y, b.y), std::fmax(a.z, b.z) };
+ return o;
+}
+
+inline float3 operator/(const float3& f3, const float& f) { return float3(f3.x / f, f3.y / f, f3.z / f); }
+
+inline float dot(const float3& a, const float3& b) {
+ return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+inline float as_float(uint32_t i)
+{
+ union { float f; uint32_t i; } fi;
+
+ fi.i = i;
+ return fi.f;
+}
+
+inline float3 as_float3(int3 i3)
+{
+ float3 o = { as_float(i3.x), as_float(i3.y), as_float(i3.z) };
+ return o;
+}
+
+inline float4 as_float4(int4 i4)
+{
+ float4 o = { as_float(i4.x), as_float(i4.y), as_float(i4.z), as_float(i4.w) };
+ return o;
+}
+
+inline float4 convert_float4_rtn(int4 i4)
+{
+ return float4(static_cast<float>(i4.x), static_cast<float>(i4.y), static_cast<float>(i4.z), static_cast<float>(i4.w));
+}
+
+inline float4 convert_float4_rtp(int4 i4)
+{
+ return convert_float4_rtn(i4);
+}
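+
+// Editor's note: unlike the OpenCL convert_*_rtn/_rtp built-ins, these host-side reference
+// implementations ignore the requested rounding mode and use the default cast; host code
+// should therefore not rely on directed rounding from these helpers.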
+
+#endif
diff --git a/src/intel/vulkan/grl/include/GRLRTASCommon.h b/src/intel/vulkan/grl/include/GRLRTASCommon.h
new file mode 100644
index 00000000000..1f2cda2ea0b
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLRTASCommon.h
@@ -0,0 +1,142 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+//
+// This file is to contain structure definitions for RTAS-related meta-data.
+// The structures here should be generic enough to apply to any acceleration structure.
+// If we ever move to KD-Trees or Octrees, this file should not need to change.
+//
+
+//********************************************************************************************
+// WARNING!!!!!
+//
+// This file is shared by OpenCL and C++ source code and must be a pure C header
+// There should only be C structure definitions and trivial inline functions here
+//
+//********************************************************************************************
+
+
+#pragma once
+#include "GRLIntTypes.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+
+ typedef struct SerializationIdentifier
+ {
+ uint8_t Bytes[16];
+ } SerializationIdentifier;
+
+ GRL_STATIC_ASSERT(sizeof(SerializationIdentifier) == 16, "Wrong size!");
+
+
+ // Header structure for RTAS serialization.
+ // This structure is binary-compatible with the DXR and Vulkan API definitions
+ typedef struct SerializationHeader
+ {
+ SerializationIdentifier DriverID; // DXR 'DriverOpaqueGUID'. Vulkan: 'driverUUID'
+ SerializationIdentifier GRLID; // DXR 'DriverOpaqueVersioningData'. Vulkan: 'accelerationStructureUUID'
+
+ uint64_t SerializedSizeInBytesIncludingHeader;
+ uint64_t DeserializedSizeInBytes;
+ uint64_t InstanceHandleCount;
+ } SerializationHeader;
+
+ GRL_STATIC_ASSERT(sizeof(SerializationHeader) == 56, "Wrong size!");
+
+ // This structure is binary-compatible with DXR and Vulkan 'InstanceDesc' structures
+ typedef struct InstanceDesc {
+ float Transform[3][4];
+ uint32_t InstanceIDAndMask; // mask in 8 msbs
+ uint32_t InstanceContributionToHitGroupIndexAndFlags; // flags in 8 msbs
+ gpuva_t AccelerationStructureGPUVA; // NOTE: In GRL this is always a VA. Vulkan CPU builds use handles here, and these may need to be translated
+ } InstanceDesc;
+ GRL_STATIC_ASSERT(sizeof(InstanceDesc) == 64, "Wrong size!");
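+
+ // Editor's illustrative sketch (not part of the original header): unpacking the packed
+ // InstanceDesc fields, following the comments above (mask and flags sit in the 8 MSBs).
+ GRL_INLINE uint32_t InstanceDesc_GetInstanceID_sketch(const InstanceDesc* d) { return d->InstanceIDAndMask & 0x00ffffffu; }
+ GRL_INLINE uint32_t InstanceDesc_GetInstanceMask_sketch(const InstanceDesc* d) { return d->InstanceIDAndMask >> 24; }
+ GRL_INLINE uint32_t InstanceDesc_GetHitGroupIndexContribution_sketch(const InstanceDesc* d) { return d->InstanceContributionToHitGroupIndexAndFlags & 0x00ffffffu; }
+ GRL_INLINE uint32_t InstanceDesc_GetInstanceFlags_sketch(const InstanceDesc* d) { return d->InstanceContributionToHitGroupIndexAndFlags >> 24; }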
+
+ typedef struct GeoMetaData{
+ uint32_t PrimitiveCount;
+ uint16_t Type;
+ uint16_t Flags;
+ } GeoMetaData;
+ GRL_STATIC_ASSERT(sizeof(GeoMetaData) == 8, "Wrong size!");
+
+ typedef struct AABB3f {
+ float lower[3];
+ float upper[3];
+ } AABB3f;
+ GRL_STATIC_ASSERT(sizeof(AABB3f) == 24, "Wrong size!");
+
+ enum_uint32(error_t_) {
+ error_t_no_error = 0x0,
+ error_t_internal_node_child_OOB = 0x1,
+ error_t_leaf_node_child_OOB = 0x2,
+ error_t_unrecognised_node_t = 0x4,
+ error_t_mixed_node_unsupported = 0x8,
+ error_t_instance_pointers_inconsistent = 0x10,
+ error_t_instance_pointed_root_not_internal = 0x20,
+ error_t_leaf_node_instance_child_missed_by_64B = 0x40,
+ error_t_internal_node_child_cycle = 0x80,
+ error_t_input_geo_insane = 0x100,
+ error_t_quad_leaf_broken = 0x200,
+ error_t_backpointer_not_reset = 0x400,
+ error_t_backpointer_wrong_children_num = 0x500,
+ error_t_backpointer_inconsitent_parent_child = 0x600,
+ error_t_backpointer_root_not_root_error = 0x700,
+ error_t_backpointer_OOB = 0x800,
+ error_t_backpointers_buffer_too_small = 0x900,
+ error_t_atomic_update_struct_fatleaf_count_oob = 0x1000, // for this and the following errors,
+ error_t_atomic_update_struct_fatleaf_node_idx_oob = 0x2000, // offset_in_BVH is just an index into the fatleaf or inner node arrays
+ error_t_atomic_update_struct_fatleaf_backpointer_mismatch = 0x3000,
+ error_t_atomic_update_struct_fatleaf_num_children_error = 0x4000,
+ error_t_atomic_update_struct_fatleaf_children_non_leaf = 0x5000,
+ error_t_atomic_update_struct_inner_count_oob = 0x6000,
+ error_t_atomic_update_struct_inner_node_idx_oob = 0x7000,
+ error_t_atomic_update_struct_inner_node_child_idx_error = 0x8000,
+ error_t_atomic_update_struct_inner_num_children_error = 0x9000,
+ error_t_atomic_update_struct_inner_children_non_internal = 0xA000,
+ error_t_unknown = 1u << 31,
+ };
+
+ enum_uint32(error_phase_t) {
+ error_phase_t_unknown = 0,
+ error_phase_t_post_build_Morton = 1,
+ error_phase_t_post_build_Trivial = 2,
+ error_phase_t_post_build_NewSAH = 3,
+ error_phase_t_post_update = 4,
+ error_phase_t_pre_update = 5,
+ error_phase_t_post_copy_op = 6,
+ };
+
+ typedef struct ERROR_INFO {
+ error_t_ type;
+ uint offset_in_BVH; //in 64B units
+ error_phase_t when;
+ uint reserved;
+ } ERROR_INFO;
+
+ // Meta-data common to all acceleration structures, which is needed to implement required functionality
+ // All RTAS structures must contain a struct of this type named 'Meta'
+ typedef struct RTASMetaData {
+ struct AABB3f bounds;
+
+ uint32_t instanceDescsStart; // byte offset to array of original instance_descs used for build. Required for DXR visualization and serialization
+ uint32_t instanceCount;
+
+ uint32_t geoDescsStart; // byte offset to array of 'GeoMetaData' matching input geos. Required for DXR visualization
+ uint32_t geoCount;
+
+ uint64_t allocationSize; // Size of the memory allocation containing this RTAS
+ // This is the size given to the app in the prebuild info when the RTAS was first created
+ // If RTAS was compacted, this will be the compacted size
+
+ ERROR_INFO errors; // only used in debug mode
+ } RTASMetaData;
+
+ GRL_STATIC_ASSERT( sizeof(RTASMetaData) == 64, "Wrong size!");
+
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/include/GRLStructs.h b/src/intel/vulkan/grl/include/GRLStructs.h
new file mode 100644
index 00000000000..c8af8313ffc
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLStructs.h
@@ -0,0 +1,60 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLIntTypes.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(_INTERNAL)
+
+ struct GeometryTriangles
+ {
+ gpuva_t pTransformBuffer;
+ gpuva_t pIndexBuffer;
+ gpuva_t pVertexBuffer;
+ qword VertexBufferByteStride;
+ dword IndexCount;
+ dword VertexCount;
+ IndexFormat IndexFormat;
+ VertexFormat VertexFormat;
+ };
+
+ struct GeometryProcedural
+ {
+ gpuva_t pAABBs_GPUVA; ///< elements of pAABBs_GPUVA are in gpuAABB format
+ qword AABBByteStride;
+ dword AABBCount;
+ };
+
+ // TODO: the 'unsigned int ShaderIndex_Mask; // extension' field is missing
+ struct Geo
+ {
+ union
+ {
+ struct GeometryTriangles Triangles;
+ struct GeometryProcedural Procedural;
+ } Desc;
+
+ GeometryType Type;
+ uint8_t Flags;
+ };
+
+ // Matches the Vulkan VkAccelerationStructureBuildRangeInfoKHR structure
+ // See Vulkan spec for data access rules:
+ // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VkAccelerationStructureBuildRangeInfoKHR.html
+ //
+ struct IndirectBuildRangeInfo
+ {
+ dword primitiveCount; // Number of primitives
+ dword primitiveOffset; // Byte offset to primitive data
+ dword firstVertex; // Index of first vertex
+ dword transformOffset; // Byte offset to transform data (for triangle Geo with non-null transform)
+ };
+
+GRL_NAMESPACE_END(_INTERNAL)
+GRL_NAMESPACE_END(GRL) \ No newline at end of file
diff --git a/src/intel/vulkan/grl/include/GRLUtilities.h b/src/intel/vulkan/grl/include/GRLUtilities.h
new file mode 100644
index 00000000000..22670bfad1b
--- /dev/null
+++ b/src/intel/vulkan/grl/include/GRLUtilities.h
@@ -0,0 +1,32 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLOCLCompatibility.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+
+ GRL_INLINE float4 bitShiftLdexp4(float4 x, int4 y)
+ {
+ y = (y + 127) << 23;
+ return x * as_float4(y);
+ }
+
+ GRL_INLINE float3 bitShiftLdexp3(float3 x, int3 y)
+ {
+ y = (y + 127) << 23;
+ return x * as_float3(y);
+ }
+
+ GRL_INLINE float bitShiftLdexp(float x, int y)
+ {
+ y = (y + 127) << 23;
+ return x * as_float(y);
+ }
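+
+    // Editor's note: (y + 127) << 23 assembles the IEEE-754 bit pattern of 2^y directly in the
+    // exponent field, so bitShiftLdexp(x, y) == x * 2^y as long as 2^y stays a normal float
+    // (roughly -126 <= y <= 127); e.g. bitShiftLdexp(3.0f, 2) == 12.0f.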
+
+GRL_NAMESPACE_END(GRL) \ No newline at end of file
diff --git a/src/intel/vulkan/grl/include/affinespace.h b/src/intel/vulkan/grl/include/affinespace.h
new file mode 100644
index 00000000000..36ebae0ede6
--- /dev/null
+++ b/src/intel/vulkan/grl/include/affinespace.h
@@ -0,0 +1,192 @@
+//
+// Copyright (C) 2009-2021 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+//
+//
+
+#pragma once
+
+#include "GRLRTASCommon.h"
+
+GRL_NAMESPACE_BEGIN(GRL)
+GRL_NAMESPACE_BEGIN(RTAS)
+inline float3 GRL_OVERLOADABLE cross(const float3 a, const float3 b)
+{
+ float3 res = { a.y * b.z - a.z * b.y,
+ a.z * b.x - a.x * b.z,
+ a.x * b.y - a.y * b.x };
+ return res;
+}
+
+struct LinearSpace3f
+{
+ float3 vx;
+ float3 vy;
+ float3 vz;
+};
+
+/* compute the determinant of the matrix */
+GRL_INLINE struct LinearSpace3f LinearSpace3f_Constructor(const float3 vx, const float3 vy, const float3 vz)
+{
+ struct LinearSpace3f xfm;
+ xfm.vx = vx;
+ xfm.vy = vy;
+ xfm.vz = vz;
+ return xfm;
+}
+
+/* compute the determinant of the matrix */
+GRL_INLINE float LinearSpace3f_det(struct LinearSpace3f xfm)
+{
+ return dot(xfm.vx, cross(xfm.vy, xfm.vz));
+}
+
+/* compute transposed matrix */
+GRL_INLINE struct LinearSpace3f LinearSpace3f_transpose(struct LinearSpace3f in)
+{
+ float3 x = { in.vx.x, in.vy.x, in.vz.x };
+ float3 y = { in.vx.y, in.vy.y, in.vz.y };
+ float3 z = { in.vx.z, in.vy.z, in.vz.z };
+
+ return LinearSpace3f_Constructor(x,
+ y,
+ z);
+}
+
+/* compute adjoint matrix */
+GRL_INLINE const struct LinearSpace3f LinearSpace3f_adjoint(struct LinearSpace3f in)
+{
+ return LinearSpace3f_transpose(LinearSpace3f_Constructor(cross(in.vy, in.vz),
+ cross(in.vz, in.vx),
+ cross(in.vx, in.vy)));
+}
+
+/* compute inverse matrix */
+GRL_INLINE struct LinearSpace3f LinearSpace3f_invert(struct LinearSpace3f in)
+{
+ const float det = LinearSpace3f_det(in);
+ const struct LinearSpace3f adj = LinearSpace3f_adjoint(in);
+ return LinearSpace3f_Constructor(adj.vx / det, adj.vy / det, adj.vz / det);
+}
+
+GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct LinearSpace3f xfm, float3 p)
+{
+ return xfm.vx * p.x + xfm.vy * p.y + xfm.vz * p.z;
+}
+
+struct AffineSpace3f
+{
+ struct LinearSpace3f l;
+ float3 p;
+};
+
+GRL_INLINE struct AffineSpace3f AffineSpace3f_Constructor(struct LinearSpace3f l, float3 p)
+{
+ struct AffineSpace3f out;
+ out.l = l;
+ out.p = p;
+ return out;
+}
+
+GRL_INLINE struct AffineSpace3f AffineSpace3f_load_row_major(const float *in)
+{
+ struct AffineSpace3f out;
+ out.l.vx.x = in[0];
+ out.l.vx.y = in[4];
+ out.l.vx.z = in[8];
+ out.l.vy.x = in[1];
+ out.l.vy.y = in[5];
+ out.l.vy.z = in[9];
+ out.l.vz.x = in[2];
+ out.l.vz.y = in[6];
+ out.l.vz.z = in[10];
+ out.p.x = in[3];
+ out.p.y = in[7];
+ out.p.z = in[11];
+ return out;
+}
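+
+// Editor's note: the expected input is the same row-major 3x4 layout used by transform_aabb()
+// and HwInstanceLeaf_GetTransform(): { m00, m01, m02, tx,  m10, m11, m12, ty,  m20, m21, m22, tz },
+// with the translation stored in elements 3, 7 and 11.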
+
+// Squared ratio of the (half) surface of the oriented, transformed cube to that of the
+// axis-aligned box that would contain it.
+// The smaller this value, the more overhead the transformation produces.
+GRL_INLINE
+float transformation_bbox_surf_overhead(const float* Transform)
+{
+ // We use an abs-matrix to transform the AABB extent vector, which is enough to compute the area
+ // New AABB is center +- Extent.
+ //
+ // For derivation see:
+ // https://zeux.io/2010/10/17/aabb-from-obb-with-component-wise-abs/
+ //
+
+
+    // Take the unit cube and compare the surface of the AABB containing its
+    // transformed image against the surface of the oriented (transformed) box itself.
+ float ex = fabs(Transform[0]) + fabs(Transform[1]) + fabs(Transform[2]);
+ float ey = fabs(Transform[4]) + fabs(Transform[5]) + fabs(Transform[6]);
+ float ez = fabs(Transform[8]) + fabs(Transform[9]) + fabs(Transform[10]);
+
+ // we will compare squared sizes
+ ex = ex * ex;
+ ey = ey * ey;
+ ez = ez * ez;
+
+ // surface of aabb containing oriented box;
+ float aabb_sq_half_surf = ex * ey + ey * ez + ez * ex;
+
+ // ^2 lengths of transformed <1,0,0>, <0,1,0>, <0,0,1>
+ float obx = Transform[0] * Transform[0] + Transform[4] * Transform[4] + Transform[8] * Transform[8];
+ float oby = Transform[1] * Transform[1] + Transform[5] * Transform[5] + Transform[9] * Transform[9];
+ float obz = Transform[2] * Transform[2] + Transform[6] * Transform[6] + Transform[10] * Transform[10];
+
+ float obb_sq_half_surf = obx * oby + oby * obz + obz * obx;
+
+ return obb_sq_half_surf / aabb_sq_half_surf;
+
+    // Example: uniform scale by 2 (Transform = diag(2, 2, 2)):
+    //   ex = ey = ez = 2.0, squared: 4.0
+    //   aabb_sq_half_surf = 4*4 + 4*4 + 4*4 = 48
+    //   obx = oby = obz = 4.0
+    //   obb_sq_half_surf  = 4*4 + 4*4 + 4*4 = 48
+    //   ratio = 48 / 48 = 1.0, i.e. an axis-aligned scale adds no overhead.
+}
+
+GRL_INLINE void load_row_major_from_AffineSpace3f(struct AffineSpace3f in, float* out)
+{
+ out[0] = in.l.vx.x;
+ out[4] = in.l.vx.y;
+ out[8] = in.l.vx.z;
+ out[1] = in.l.vy.x;
+ out[5] = in.l.vy.y;
+ out[9] = in.l.vy.z;
+ out[2] = in.l.vz.x;
+ out[6] = in.l.vz.y;
+ out[10] = in.l.vz.z;
+
+ out[3] = in.p.x;
+ out[7] = in.p.y;
+ out[11] = in.p.z;
+}
+
+GRL_INLINE float3 GRL_OVERLOADABLE xfmPoint(struct AffineSpace3f xfm, float3 p)
+{
+ return xfmPoint(xfm.l, p) + xfm.p;
+}
+
+/* compute inverse matrix */
+GRL_INLINE struct AffineSpace3f AffineSpace3f_invert(struct AffineSpace3f in)
+{
+ const struct LinearSpace3f il = LinearSpace3f_invert(in.l);
+ float3 ip = -xfmPoint(il, in.p);
+ return AffineSpace3f_Constructor(il, ip);
+}
+
+GRL_NAMESPACE_END(RTAS)
+GRL_NAMESPACE_END(GRL)
diff --git a/src/intel/vulkan/grl/meson.build b/src/intel/vulkan/grl/meson.build
new file mode 100644
index 00000000000..61cb7aa8ea3
--- /dev/null
+++ b/src/intel/vulkan/grl/meson.build
@@ -0,0 +1,203 @@
+# Copyright © 2021 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+fs = import('fs')
+
+grl_lib_files = [
+ 'gpu/libs/libraries.grl',
+]
+
+grl_grl_files = [
+ 'gpu/build_leaf.grl',
+ 'gpu/build_primref.grl',
+# 'gpu/build_refit.grl',
+ 'gpu/copy.grl',
+# 'gpu/grl_api_interface_verify.grl',
+ 'gpu/misc.grl',
+# 'gpu/morton_builder.grl',
+# 'gpu/msb_radix_bitonic_sort.grl',
+ 'gpu/new_sah_builder.grl',
+ 'gpu/postbuild_info.grl',
+# 'gpu/presplit.grl',
+# 'gpu/radix_sort.grl',
+# 'gpu/rebraid.grl',
+# 'gpu/traversal_shader.grl',
+]
+
+grl_lib_args = []
+foreach libfile : grl_lib_files
+ grl_lib_args += '--library'
+ grl_lib_args += files(libfile)
+endforeach
+
+grl_genX_files = [
+ 'genX_grl_dispatch.c',
+ 'genX_grl_uuid.cpp',
+]
+
+grl_cl_kernel_h = custom_target(
+ 'grl_cl_kernel.h',
+ input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
+ output : 'grl_cl_kernel.h',
+ command : [
+ prog_python, '@INPUT0@', '--out-h', '@OUTPUT@',
+ grl_lib_args, files(grl_grl_files),
+ ],
+)
+
+has_ply = run_command(
+ prog_python, '-c',
+ '''
+import ply
+ ''', check : false)
+if has_ply.returncode() != 0
+ error('Python (3.x) ply module required to build GRL kernels.')
+endif
+
+r = run_command(prog_python, 'grl_cl_kernel_gen.py',
+ grl_lib_args, '--ls-kernels', grl_grl_files, check : false)
+assert(r.returncode() == 0, 'Failed to fetch GRL CL kernels')
+grl_kernels = r.stdout().strip().split()
+
+grl_metakernel_c = []
+grl_metakernel_h = []
+foreach grl_file : grl_grl_files
+ base_outfile = 'grl_metakernel_' + fs.replace_suffix(fs.name(grl_file), '')
+ outfiles = custom_target(
+ base_outfile,
+ input : ['grl_metakernel_gen.py', grl_file, grl_lib_files],
+ output : [base_outfile + '.h', base_outfile + '.c'],
+ command : [
+ prog_python, '@INPUT0@', '--out-h', '@OUTPUT0@',
+ '--out-c', '@OUTPUT1@', grl_lib_args, '@INPUT1@',
+ ],
+ )
+ grl_metakernel_h += outfiles[0]
+ grl_metakernel_c += outfiles[1]
+endforeach
+
+grl_genX_libs = []
+foreach t : [['125', 'gfx125', 'dg2']]
+ verX10 = t[0]
+ genX_prefix = t[1]
+ platform = t[2]
+
+ grl_compiled_cl_kernels = []
+ foreach k : grl_kernels
+ # get_cl_files dumps out filename:entrypoint:libfile1,libfile2,libfile3
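+    # e.g. 'gpu/copy.cl:copy_kernel:gpu/libs/libraries.grl' (names illustrative)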
+ cl_file = k.split(':')[0]
+ entrypoint = k.split(':')[1]
+ library_files = k.split(':')[2]
+ kernel_prefix = '_'.join([
+ genX_prefix,
+ fs.replace_suffix(cl_file, '').replace('gpu/', '').replace('/', '_'),
+ entrypoint
+ ])
+ input_args = [ files(cl_file), ]
+ if library_files != ''
+ foreach lib_file : library_files.split(',')
+ input_args += [ lib_file ]
+ endforeach
+ endif
+ prepended_input_args = []
+ foreach input_arg : input_args
+ prepended_input_args += ['--in', input_arg]
+ endforeach
+ outfile = kernel_prefix + '.h'
+ grl_compiled_cl_kernels += custom_target(
+ outfile,
+ input : cl_file,
+ output : outfile,
+ command : [
+ prog_intel_clc, '-p', platform, '--prefix', kernel_prefix,
+ '-e', entrypoint, prepended_input_args, '-o', '@OUTPUT@', '--',
+ '-cl-std=cl2.0', '-D__OPENCL_VERSION__=200',
+ '-DMAX_HW_SIMD_WIDTH=16', '-DMAX_WORKGROUP_SIZE=16',
+ '-I' + join_paths(meson.current_source_dir(), 'gpu'),
+ '-I' + join_paths(meson.current_source_dir(), 'include'),
+ ],
+ env: ['MESA_SHADER_CACHE_DISABLE=true',
+ 'MESA_SPIRV_LOG_LEVEL=error'],
+ depends : dep_prog_intel_clc
+ )
+ endforeach
+
+ grl_cl_kernel_c = custom_target(
+ 'grl_@0@_cl_kernel.c'.format(genX_prefix),
+ input : ['grl_cl_kernel_gen.py', grl_grl_files, grl_lib_files],
+ output : 'grl_@0@_cl_kernel.c'.format(genX_prefix),
+ command : [
+ prog_python, '@INPUT0@', '--out-c', '@OUTPUT@',
+ grl_lib_args, '--prefix', genX_prefix, files(grl_grl_files),
+ ],
+ )
+
+ grl_genX_libs += static_library(
+ 'grl_@0@'.format(genX_prefix),
+ [grl_cl_kernel_h, grl_compiled_cl_kernels, grl_cl_kernel_c,
+ grl_genX_files, grl_metakernel_c, grl_metakernel_h],
+ include_directories : [
+ inc_include, inc_src,
+ inc_intel,
+ ],
+ c_args : [
+ no_override_init_args, sse2_args,
+ '-DGFX_VERx10=@0@'.format(verX10),
+ ],
+ cpp_args : [
+ sse2_args,
+ '-DGFX_VERx10=@0@'.format(verX10),
+ ],
+ dependencies : [
+ dep_valgrind, idep_nir_headers, idep_vulkan_util_headers, idep_vulkan_wsi_headers,
+ idep_vulkan_runtime_headers, idep_anv_headers, idep_genxml,
+ ],
+ gnu_symbol_visibility : 'hidden',
+ )
+endforeach
+
+libgrl_deps = [
+ dep_valgrind,
+ idep_nir_headers,
+ idep_vulkan_util_headers,
+ idep_vulkan_wsi_headers,
+]
+
+libgrl = static_library(
+ 'grl',
+ [grl_cl_kernel_h],
+ include_directories : [
+ inc_include, inc_src, inc_intel,
+ ],
+ link_whole : [grl_genX_libs],
+ dependencies : [libgrl_deps, idep_anv_headers],
+)
+idep_grl = declare_dependency(
+ link_with : libgrl,
+ dependencies : libgrl_deps,
+ sources : [grl_metakernel_h, grl_cl_kernel_h],
+ include_directories : include_directories('include', 'gpu'),
+)
diff --git a/src/intel/vulkan/i915/anv_batch_chain.c b/src/intel/vulkan/i915/anv_batch_chain.c
new file mode 100644
index 00000000000..dd3d40bf13f
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_batch_chain.c
@@ -0,0 +1,1107 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "i915/anv_batch_chain.h"
+#include "anv_private.h"
+#include "anv_measure.h"
+
+#include "perf/intel_perf.h"
+#include "util/u_debug.h"
+
+#include "drm-uapi/i915_drm.h"
+
+struct anv_execbuf {
+ struct drm_i915_gem_execbuffer2 execbuf;
+
+ struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences;
+
+ struct drm_i915_gem_exec_object2 * objects;
+ uint32_t bo_count;
+ uint32_t bo_array_length;
+ struct anv_bo ** bos;
+
+ uint32_t syncobj_count;
+ uint32_t syncobj_array_length;
+ struct drm_i915_gem_exec_fence * syncobjs;
+ uint64_t * syncobj_values;
+
+ uint32_t cmd_buffer_count;
+ struct anv_query_pool *perf_query_pool;
+
+ const VkAllocationCallbacks * alloc;
+ VkSystemAllocationScope alloc_scope;
+
+ int perf_query_pass;
+};
+
+static void
+anv_execbuf_finish(struct anv_execbuf *exec)
+{
+ vk_free(exec->alloc, exec->syncobjs);
+ vk_free(exec->alloc, exec->syncobj_values);
+ vk_free(exec->alloc, exec->objects);
+ vk_free(exec->alloc, exec->bos);
+}
+
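+/* Append an i915_user_extension to the execbuf's extension chain. With
+ * I915_EXEC_USE_EXTENSIONS set, the kernel reuses the legacy cliprects_ptr
+ * field as the head of that linked list.
+ */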
+static void
+anv_execbuf_add_ext(struct anv_execbuf *exec,
+ uint32_t ext_name,
+ struct i915_user_extension *ext)
+{
+ __u64 *iter = &exec->execbuf.cliprects_ptr;
+
+ exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS;
+
+ while (*iter != 0) {
+ iter = (__u64 *) &((struct i915_user_extension *)(uintptr_t)*iter)->next_extension;
+ }
+
+ ext->name = ext_name;
+
+ *iter = (uintptr_t) ext;
+}
+
+static VkResult
+anv_execbuf_add_bo_bitset(struct anv_device *device,
+ struct anv_execbuf *exec,
+ uint32_t dep_words,
+ BITSET_WORD *deps,
+ uint32_t extra_flags);
+
+static VkResult
+anv_execbuf_add_bo(struct anv_device *device,
+ struct anv_execbuf *exec,
+ struct anv_bo *bo,
+ struct anv_reloc_list *relocs,
+ uint32_t extra_flags)
+{
+ struct drm_i915_gem_exec_object2 *obj = NULL;
+
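+   /* bo->exec_obj_index may be stale (left over from a previous execbuf), so
+    * only reuse the slot if it actually points back at this BO.
+    */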
+ if (bo->exec_obj_index < exec->bo_count &&
+ exec->bos[bo->exec_obj_index] == bo)
+ obj = &exec->objects[bo->exec_obj_index];
+
+ if (obj == NULL) {
+ /* We've never seen this one before. Add it to the list and assign
+ * an id that we can use later.
+ */
+ if (exec->bo_count >= exec->bo_array_length) {
+ uint32_t new_len = exec->objects ? exec->bo_array_length * 2 : 64;
+
+ struct drm_i915_gem_exec_object2 *new_objects =
+ vk_realloc(exec->alloc, exec->objects,
+ new_len * sizeof(*new_objects), 8, exec->alloc_scope);
+ if (new_objects == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ exec->objects = new_objects;
+
+ struct anv_bo **new_bos =
+ vk_realloc(exec->alloc, exec->bos, new_len * sizeof(*new_bos), 8,
+ exec->alloc_scope);
+ if (new_bos == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ exec->bos = new_bos;
+ exec->bo_array_length = new_len;
+ }
+
+ assert(exec->bo_count < exec->bo_array_length);
+
+ bo->exec_obj_index = exec->bo_count++;
+ obj = &exec->objects[bo->exec_obj_index];
+ exec->bos[bo->exec_obj_index] = bo;
+
+ obj->handle = bo->gem_handle;
+ obj->relocation_count = 0;
+ obj->relocs_ptr = 0;
+ obj->alignment = 0;
+ obj->offset = bo->offset;
+ obj->flags = bo->flags | extra_flags;
+ obj->rsvd1 = 0;
+ obj->rsvd2 = 0;
+ }
+
+ if (extra_flags & EXEC_OBJECT_WRITE) {
+ obj->flags |= EXEC_OBJECT_WRITE;
+ obj->flags &= ~EXEC_OBJECT_ASYNC;
+ }
+
+ if (relocs != NULL) {
+ return anv_execbuf_add_bo_bitset(device, exec, relocs->dep_words,
+ relocs->deps, extra_flags);
+ }
+
+ return VK_SUCCESS;
+}
+
+/* Add BO dependencies to execbuf */
+static VkResult
+anv_execbuf_add_bo_bitset(struct anv_device *device,
+ struct anv_execbuf *exec,
+ uint32_t dep_words,
+ BITSET_WORD *deps,
+ uint32_t extra_flags)
+{
+ for (uint32_t w = 0; w < dep_words; w++) {
+ BITSET_WORD mask = deps[w];
+ while (mask) {
+ int i = u_bit_scan(&mask);
+ uint32_t gem_handle = w * BITSET_WORDBITS + i;
+ struct anv_bo *bo = anv_device_lookup_bo(device, gem_handle);
+ assert(bo->refcount > 0);
+ VkResult result =
+ anv_execbuf_add_bo(device, exec, bo, NULL, extra_flags);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_execbuf_add_syncobj(struct anv_device *device,
+ struct anv_execbuf *exec,
+ uint32_t syncobj,
+ uint32_t flags,
+ uint64_t timeline_value)
+{
+ if (exec->syncobj_count >= exec->syncobj_array_length) {
+ uint32_t new_len = MAX2(exec->syncobj_array_length * 2, 16);
+
+ struct drm_i915_gem_exec_fence *new_syncobjs =
+ vk_realloc(exec->alloc, exec->syncobjs,
+ new_len * sizeof(*new_syncobjs), 8, exec->alloc_scope);
+ if (new_syncobjs == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ exec->syncobjs = new_syncobjs;
+
+ if (exec->syncobj_values) {
+ uint64_t *new_syncobj_values =
+ vk_realloc(exec->alloc, exec->syncobj_values,
+ new_len * sizeof(*new_syncobj_values), 8,
+ exec->alloc_scope);
+ if (new_syncobj_values == NULL)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ exec->syncobj_values = new_syncobj_values;
+ }
+
+ exec->syncobj_array_length = new_len;
+ }
+
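+   /* Lazily allocate the values array the first time a timeline point shows
+    * up; submissions with only binary syncobjs never need it.
+    */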
+ if (timeline_value && !exec->syncobj_values) {
+ exec->syncobj_values =
+ vk_zalloc(exec->alloc, exec->syncobj_array_length *
+ sizeof(*exec->syncobj_values),
+ 8, exec->alloc_scope);
+ if (!exec->syncobj_values)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ }
+
+ exec->syncobjs[exec->syncobj_count] = (struct drm_i915_gem_exec_fence) {
+ .handle = syncobj,
+ .flags = flags,
+ };
+ if (exec->syncobj_values)
+ exec->syncobj_values[exec->syncobj_count] = timeline_value;
+
+ exec->syncobj_count++;
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_execbuf_add_sync(struct anv_device *device,
+ struct anv_execbuf *execbuf,
+ struct vk_sync *sync,
+ bool is_signal,
+ uint64_t value)
+{
+ /* It's illegal to signal a timeline with value 0 because that's never
+ * higher than the current value. A timeline wait on value 0 is always
+ * trivial because 0 <= uint64_t always.
+ */
+ if ((sync->flags & VK_SYNC_IS_TIMELINE) && value == 0)
+ return VK_SUCCESS;
+
+ if (vk_sync_is_anv_bo_sync(sync)) {
+ struct anv_bo_sync *bo_sync =
+ container_of(sync, struct anv_bo_sync, sync);
+
+ assert(is_signal == (bo_sync->state == ANV_BO_SYNC_STATE_RESET));
+
+ return anv_execbuf_add_bo(device, execbuf, bo_sync->bo, NULL,
+ is_signal ? EXEC_OBJECT_WRITE : 0);
+ } else if (vk_sync_type_is_drm_syncobj(sync->type)) {
+ struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(sync);
+
+ if (!(sync->flags & VK_SYNC_IS_TIMELINE))
+ value = 0;
+
+ return anv_execbuf_add_syncobj(device, execbuf, syncobj->syncobj,
+ is_signal ? I915_EXEC_FENCE_SIGNAL :
+ I915_EXEC_FENCE_WAIT,
+ value);
+ }
+
+ unreachable("Invalid sync type");
+}
+
+static VkResult
+setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
+ struct anv_cmd_buffer *cmd_buffer)
+{
+ VkResult result;
+ /* Add surface dependencies (BOs) to the execbuf */
+ result = anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
+ cmd_buffer->surface_relocs.dep_words,
+ cmd_buffer->surface_relocs.deps, 0);
+ if (result != VK_SUCCESS)
+ return result;
+
+ /* First, we walk over all of the bos we've seen and add them and their
+ * relocations to the validate list.
+ */
+ struct anv_batch_bo **bbo;
+ u_vector_foreach(bbo, &cmd_buffer->seen_bbos) {
+ result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
+ (*bbo)->bo, &(*bbo)->relocs, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ struct anv_bo **bo_entry;
+ u_vector_foreach(bo_entry, &cmd_buffer->dynamic_bos) {
+ result = anv_execbuf_add_bo(cmd_buffer->device, execbuf,
+ *bo_entry, NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+pin_state_pool(struct anv_device *device,
+ struct anv_execbuf *execbuf,
+ struct anv_state_pool *pool)
+{
+ anv_block_pool_foreach_bo(bo, &pool->block_pool) {
+ VkResult result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ return VK_SUCCESS;
+}
+
+static void
+get_context_and_exec_flags(struct anv_queue *queue,
+ bool is_companion_rcs_batch,
+ uint64_t *exec_flags,
+ uint32_t *context_id)
+{
+ assert(queue != NULL);
+
+ struct anv_device *device = queue->device;
+
+   /* Submit batches to index 0, which is the main virtual engine */
+ *exec_flags = device->physical->has_vm_control ? 0 : queue->exec_flags;
+
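+   /* With VM control each queue owns its own kernel context (and optionally a
+    * companion RCS context); without it, every queue shares the single
+    * device-wide context.
+    */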
+ *context_id = device->physical->has_vm_control ?
+ is_companion_rcs_batch ?
+ queue->companion_rcs_id :
+ queue->context_id :
+ device->context_id;
+}
+
+static VkResult
+anv_execbuf_add_trtt_bos(struct anv_device *device,
+ struct anv_execbuf *execbuf)
+{
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result = VK_SUCCESS;
+
+ /* If l3_addr is zero we're not using TR-TT, there's no bo to add. */
+ if (!trtt->l3_addr)
+ return VK_SUCCESS;
+
+ pthread_mutex_lock(&trtt->mutex);
+
+ for (int i = 0; i < trtt->num_page_table_bos; i++) {
+ result = anv_execbuf_add_bo(device, execbuf, trtt->page_table_bos[i],
+ NULL, 0);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+out:
+ pthread_mutex_unlock(&trtt->mutex);
+ return result;
+}
+
+static VkResult
+setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
+ struct anv_queue *queue,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t num_cmd_buffers)
+{
+ struct anv_device *device = queue->device;
+ VkResult result;
+
+ if (unlikely(device->physical->measure_device.config)) {
+ for (uint32_t i = 0; i < num_cmd_buffers; i++)
+ anv_measure_submit(cmd_buffers[i]);
+ }
+
+ /* Edit the tail of the command buffers to chain them all together if they
+ * can be.
+ */
+ anv_cmd_buffer_chain_command_buffers(cmd_buffers, num_cmd_buffers);
+
+ for (uint32_t i = 0; i < num_cmd_buffers; i++) {
+ result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ /* Add all the global BOs to the object list for softpin case. */
+ result = pin_state_pool(device, execbuf, &device->scratch_surface_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->physical->va.bindless_surface_state_pool.size > 0) {
+ result = pin_state_pool(device, execbuf, &device->bindless_surface_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ if (device->physical->va.indirect_push_descriptor_pool.size > 0) {
+ result = pin_state_pool(device, execbuf, &device->indirect_push_descriptor_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ result = pin_state_pool(device, execbuf, &device->internal_surface_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = pin_state_pool(device, execbuf, &device->dynamic_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->vk.enabled_extensions.EXT_descriptor_buffer) {
+ result = pin_state_pool(device, execbuf, &device->dynamic_state_db_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ result = pin_state_pool(device, execbuf, &device->general_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = pin_state_pool(device, execbuf, &device->instruction_state_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ result = pin_state_pool(device, execbuf, &device->binding_table_pool);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->physical->va.aux_tt_pool.size > 0) {
+ result = pin_state_pool(device, execbuf, &device->aux_tt_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ if (device->physical->va.push_descriptor_buffer_pool.size > 0) {
+ result = pin_state_pool(device, execbuf, &device->push_descriptor_buffer_pool);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+   /* Add the BOs for all user-allocated memory objects because we can't
+    * track them after the binding updates of VK_EXT_descriptor_indexing and
+    * due to how sparse resources work.
+ */
+ list_for_each_entry(struct anv_device_memory, mem,
+ &device->memory_objects, link) {
+ result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ result = anv_execbuf_add_trtt_bos(device, execbuf);
+ if (result != VK_SUCCESS)
+ return result;
+
+   /* Add all the private BOs from images because we can't track them after
+    * the binding updates of VK_EXT_descriptor_indexing.
+ */
+ list_for_each_entry(struct anv_image, image,
+ &device->image_private_objects, link) {
+ struct anv_bo *private_bo =
+ image->bindings[ANV_IMAGE_MEMORY_BINDING_PRIVATE].address.bo;
+ result = anv_execbuf_add_bo(device, execbuf, private_bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+
+ struct list_head *batch_bo = &cmd_buffers[0]->batch_bos;
+ struct anv_batch_bo *first_batch_bo =
+ list_first_entry(batch_bo, struct anv_batch_bo, link);
+
+ /* The kernel requires that the last entry in the validation list be the
+ * batch buffer to execute. We can simply swap the element
+ * corresponding to the first batch_bo in the chain with the last
+ * element in the list.
+ */
+ if (first_batch_bo->bo->exec_obj_index != execbuf->bo_count - 1) {
+ uint32_t idx = first_batch_bo->bo->exec_obj_index;
+ uint32_t last_idx = execbuf->bo_count - 1;
+
+ struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
+ assert(execbuf->bos[idx] == first_batch_bo->bo);
+
+ execbuf->objects[idx] = execbuf->objects[last_idx];
+ execbuf->bos[idx] = execbuf->bos[last_idx];
+ execbuf->bos[idx]->exec_obj_index = idx;
+
+ execbuf->objects[last_idx] = tmp_obj;
+ execbuf->bos[last_idx] = first_batch_bo->bo;
+ first_batch_bo->bo->exec_obj_index = last_idx;
+ }
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(device->batch_bo_pool.bo_alloc_flags))
+ anv_cmd_buffer_clflush(cmd_buffers, num_cmd_buffers);
+#endif
+
+ assert(!cmd_buffers[0]->is_companion_rcs_cmd_buffer || device->physical->has_vm_control);
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, cmd_buffers[0]->is_companion_rcs_cmd_buffer,
+ &exec_flags, &context_id);
+
+ execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf->objects,
+ .buffer_count = execbuf->bo_count,
+ .batch_start_offset = 0,
+ .batch_len = 0,
+ .cliprects_ptr = 0,
+ .num_cliprects = 0,
+ .DR1 = 0,
+ .DR4 = 0,
+ .flags = I915_EXEC_NO_RELOC |
+ I915_EXEC_HANDLE_LUT |
+ exec_flags,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ };
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue)
+{
+ struct anv_device *device = queue->device;
+ VkResult result = anv_execbuf_add_bo(device, execbuf,
+ device->trivial_batch_bo,
+ NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
+
+ execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf->objects,
+ .buffer_count = execbuf->bo_count,
+ .batch_start_offset = 0,
+ .batch_len = 8, /* GFX7_MI_BATCH_BUFFER_END and NOOP */
+ .flags = I915_EXEC_HANDLE_LUT | exec_flags | I915_EXEC_NO_RELOC,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ };
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+setup_utrace_execbuf(struct anv_execbuf *execbuf, struct anv_queue *queue,
+ struct anv_utrace_submit *submit)
+{
+ struct anv_device *device = queue->device;
+
+ /* Always add the workaround BO as it includes a driver identifier for the
+ * error_state.
+ */
+ VkResult result = anv_execbuf_add_bo(device, execbuf,
+ device->workaround_bo,
+ NULL, 0);
+ if (result != VK_SUCCESS)
+ return result;
+
+ util_dynarray_foreach(&submit->batch_bos, struct anv_bo *, _bo) {
+ struct anv_bo *bo = *_bo;
+
+ result = anv_execbuf_add_bo(device, execbuf, bo,
+ &submit->relocs, 0);
+ if (result != VK_SUCCESS)
+ return result;
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(bo->alloc_flags))
+ intel_flush_range(bo->map, bo->size);
+#endif
+ }
+
+ result = anv_execbuf_add_sync(device, execbuf, submit->sync,
+ true /* is_signal */, 0 /* value */);
+ if (result != VK_SUCCESS)
+ return result;
+
+ struct anv_bo *batch_bo =
+ *util_dynarray_element(&submit->batch_bos, struct anv_bo *, 0);
+ if (batch_bo->exec_obj_index != execbuf->bo_count - 1) {
+ uint32_t idx = batch_bo->exec_obj_index;
+ uint32_t last_idx = execbuf->bo_count - 1;
+
+ struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
+ assert(execbuf->bos[idx] == batch_bo);
+
+ execbuf->objects[idx] = execbuf->objects[last_idx];
+ execbuf->bos[idx] = execbuf->bos[last_idx];
+ execbuf->bos[idx]->exec_obj_index = idx;
+
+ execbuf->objects[last_idx] = tmp_obj;
+ execbuf->bos[last_idx] = batch_bo;
+ batch_bo->exec_obj_index = last_idx;
+ }
+
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
+
+ execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf->objects,
+ .buffer_count = execbuf->bo_count,
+ .batch_start_offset = 0,
+ .batch_len = submit->batch.next - submit->batch.start,
+ .flags = I915_EXEC_NO_RELOC |
+ I915_EXEC_HANDLE_LUT |
+ I915_EXEC_FENCE_ARRAY |
+ exec_flags,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ .num_cliprects = execbuf->syncobj_count,
+ .cliprects_ptr = (uintptr_t)execbuf->syncobjs,
+ };
+
+ return VK_SUCCESS;
+}
+
+static int
+anv_gem_execbuffer(struct anv_device *device,
+ struct drm_i915_gem_execbuffer2 *execbuf)
+{
+ int ret;
+ const unsigned long request = (execbuf->flags & I915_EXEC_FENCE_OUT) ?
+ DRM_IOCTL_I915_GEM_EXECBUFFER2_WR :
+ DRM_IOCTL_I915_GEM_EXECBUFFER2;
+
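+   /* The kernel can transiently fail the submission with ENOMEM; retry until
+    * it either succeeds or fails with a different errno.
+    */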
+ do {
+ ret = intel_ioctl(device->fd, request, execbuf);
+ } while (ret && errno == ENOMEM);
+
+ return ret;
+}
+
+static VkResult
+anv_queue_exec_utrace_locked(struct anv_queue *queue,
+ struct anv_utrace_submit *submit)
+{
+ assert(util_dynarray_num_elements(&submit->batch_bos,
+ struct anv_bo *) > 0);
+
+ struct anv_device *device = queue->device;
+ struct anv_execbuf execbuf = {
+ .alloc = &device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ };
+
+ VkResult result = setup_utrace_execbuf(&execbuf, queue, submit);
+ if (result != VK_SUCCESS)
+ goto error;
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(queue->device, &execbuf.execbuf);
+ if (ret)
+ result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
+
+ error:
+ anv_execbuf_finish(&execbuf);
+
+ return result;
+}
+
+static void
+anv_i915_debug_submit(const struct anv_execbuf *execbuf)
+{
+ uint32_t total_size_kb = 0, total_vram_only_size_kb = 0;
+ for (uint32_t i = 0; i < execbuf->bo_count; i++) {
+ const struct anv_bo *bo = execbuf->bos[i];
+ total_size_kb += bo->size / 1024;
+ if (anv_bo_is_vram_only(bo))
+ total_vram_only_size_kb += bo->size / 1024;
+ }
+
+ fprintf(stderr, "Batch offset=0x%x len=0x%x on queue 0 (aperture: %.1fMb, %.1fMb VRAM only)\n",
+ execbuf->execbuf.batch_start_offset, execbuf->execbuf.batch_len,
+ (float)total_size_kb / 1024.0f,
+ (float)total_vram_only_size_kb / 1024.0f);
+ for (uint32_t i = 0; i < execbuf->bo_count; i++) {
+ const struct anv_bo *bo = execbuf->bos[i];
+
+ fprintf(stderr, " BO: addr=0x%016"PRIx64"-0x%016"PRIx64" size=%7"PRIu64
+ "KB handle=%05u capture=%u vram_only=%u name=%s\n",
+ bo->offset, bo->offset + bo->size - 1, bo->size / 1024,
+ bo->gem_handle, (bo->flags & EXEC_OBJECT_CAPTURE) != 0,
+ anv_bo_is_vram_only(bo), bo->name);
+ }
+}
+
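+/* Attach the collected syncobjs to the execbuf: timeline points go through
+ * the DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES extension, while purely
+ * binary syncobjs use the legacy I915_EXEC_FENCE_ARRAY path (which reuses the
+ * cliprects fields).
+ */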
+static void
+setup_execbuf_fence_params(struct anv_execbuf *execbuf)
+{
+ if (execbuf->syncobj_values) {
+ execbuf->timeline_fences.fence_count = execbuf->syncobj_count;
+ execbuf->timeline_fences.handles_ptr = (uintptr_t)execbuf->syncobjs;
+ execbuf->timeline_fences.values_ptr = (uintptr_t)execbuf->syncobj_values;
+ anv_execbuf_add_ext(execbuf,
+ DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES,
+ &execbuf->timeline_fences.base);
+ } else if (execbuf->syncobjs) {
+ execbuf->execbuf.flags |= I915_EXEC_FENCE_ARRAY;
+ execbuf->execbuf.num_cliprects = execbuf->syncobj_count;
+ execbuf->execbuf.cliprects_ptr = (uintptr_t)execbuf->syncobjs;
+ }
+}
+
+static VkResult
+i915_companion_rcs_queue_exec_locked(struct anv_queue *queue,
+ struct anv_cmd_buffer *companion_rcs_cmd_buffer,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits)
+{
+ struct anv_device *device = queue->device;
+ struct anv_execbuf execbuf = {
+ .alloc = &queue->device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ };
+
+ /* Always add the workaround BO as it includes a driver identifier for the
+ * error_state.
+ */
+ VkResult result =
+ anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ goto error;
+
+ for (uint32_t i = 0; i < wait_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ waits[i].sync,
+ false /* is_signal */,
+ waits[i].wait_value);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ if (queue->companion_sync) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ queue->companion_sync,
+ true /* is_signal */, 0);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
+ &companion_rcs_cmd_buffer, 1);
+ if (result != VK_SUCCESS)
+ goto error;
+
+ if (INTEL_DEBUG(DEBUG_SUBMIT))
+ anv_i915_debug_submit(&execbuf);
+
+ anv_cmd_buffer_exec_batch_debug(queue, 1, &companion_rcs_cmd_buffer, NULL, 0);
+
+ setup_execbuf_fence_params(&execbuf);
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(queue->device, &execbuf.execbuf);
+ if (ret) {
+ anv_i915_debug_submit(&execbuf);
+ result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
+ }
+
+ error:
+ anv_execbuf_finish(&execbuf);
+ return result;
+}
+
+VkResult
+i915_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit)
+{
+ struct anv_device *device = queue->device;
+ struct anv_execbuf execbuf = {
+ .alloc = &queue->device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ .perf_query_pass = perf_query_pass,
+ };
+ VkResult result;
+
+ if (utrace_submit &&
+ util_dynarray_num_elements(&utrace_submit->batch_bos,
+ struct anv_bo *) == 0) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ utrace_submit->sync,
+ true /* is_signal */,
+ 0);
+ if (result != VK_SUCCESS)
+ goto error;
+
+      /* The utrace submission doesn't have its own batch buffer; its sync is
+       * signaled by this execbuf, so don't submit it separately below.
+       */
+ utrace_submit = NULL;
+ }
+
+ /* Always add the workaround BO as it includes a driver identifier for the
+ * error_state.
+ */
+ result =
+ anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ goto error;
+
+ for (uint32_t i = 0; i < wait_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ waits[i].sync,
+ false /* is_signal */,
+ waits[i].wait_value);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ for (uint32_t i = 0; i < signal_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ signals[i].sync,
+ true /* is_signal */,
+ signals[i].signal_value);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ if (queue->sync) {
+ result = anv_execbuf_add_sync(device, &execbuf,
+ queue->sync,
+ true /* is_signal */,
+ 0 /* signal_value */);
+ if (result != VK_SUCCESS)
+ goto error;
+ }
+
+ if (cmd_buffer_count) {
+ result = setup_execbuf_for_cmd_buffers(&execbuf, queue, cmd_buffers,
+ cmd_buffer_count);
+ } else {
+ result = setup_empty_execbuf(&execbuf, queue);
+ }
+
+ if (result != VK_SUCCESS)
+ goto error;
+
+ const bool has_perf_query =
+ perf_query_pool && perf_query_pass >= 0 && cmd_buffer_count;
+
+ if (INTEL_DEBUG(DEBUG_SUBMIT))
+ anv_i915_debug_submit(&execbuf);
+
+ anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers,
+ perf_query_pool, perf_query_pass);
+
+ setup_execbuf_fence_params(&execbuf);
+
+ if (has_perf_query) {
+ assert(perf_query_pass < perf_query_pool->n_passes);
+ struct intel_perf_query_info *query_info =
+ perf_query_pool->pass_query[perf_query_pass];
+
+      /* Some performance queries only use the pipeline statistics HW; no OA
+       * is needed in that case, so there is no need to reconfigure.
+ */
+ if (!INTEL_DEBUG(DEBUG_NO_OACONFIG) &&
+ (query_info->kind == INTEL_PERF_QUERY_TYPE_OA ||
+ query_info->kind == INTEL_PERF_QUERY_TYPE_RAW)) {
+ int ret = intel_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
+ (void *)(uintptr_t) query_info->oa_metrics_set_id);
+ if (ret < 0) {
+ result = vk_device_set_lost(&device->vk,
+ "i915-perf config failed: %s",
+ strerror(errno));
+ }
+ }
+
+ struct anv_bo *pass_batch_bo = perf_query_pool->bo;
+
+ struct drm_i915_gem_exec_object2 query_pass_object = {
+ .handle = pass_batch_bo->gem_handle,
+ .offset = pass_batch_bo->offset,
+ .flags = pass_batch_bo->flags,
+ };
+
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
+
+ struct drm_i915_gem_execbuffer2 query_pass_execbuf = {
+ .buffers_ptr = (uintptr_t) &query_pass_object,
+ .buffer_count = 1,
+ .batch_start_offset = khr_perf_query_preamble_offset(perf_query_pool,
+ perf_query_pass),
+ .flags = I915_EXEC_HANDLE_LUT | exec_flags,
+ .rsvd1 = context_id,
+ };
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(queue->device, &query_pass_execbuf);
+ if (ret)
+ result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
+ }
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(queue->device, &execbuf.execbuf);
+ if (ret) {
+ anv_i915_debug_submit(&execbuf);
+ result = vk_queue_set_lost(&queue->vk, "execbuf2 failed: %m");
+ }
+
+ if (cmd_buffer_count != 0 && cmd_buffers[0]->companion_rcs_cmd_buffer) {
+ struct anv_cmd_buffer *companion_rcs_cmd_buffer =
+ cmd_buffers[0]->companion_rcs_cmd_buffer;
+ assert(companion_rcs_cmd_buffer->is_companion_rcs_cmd_buffer);
+ assert(cmd_buffer_count == 1);
+ result = i915_companion_rcs_queue_exec_locked(queue,
+ cmd_buffers[0]->companion_rcs_cmd_buffer, wait_count,
+ waits);
+ }
+
+ result = anv_queue_post_submit(queue, result);
+
+ error:
+ anv_execbuf_finish(&execbuf);
+
+ if (result == VK_SUCCESS && utrace_submit)
+ result = anv_queue_exec_utrace_locked(queue, utrace_submit);
+
+ return result;
+}
+
+VkResult
+i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
+ uint32_t batch_bo_size, bool is_companion_rcs_batch)
+{
+ struct anv_device *device = queue->device;
+ struct anv_execbuf execbuf = {
+ .alloc = &queue->device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ };
+
+ VkResult result = anv_execbuf_add_bo(device, &execbuf, batch_bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ goto fail;
+
+ assert(!is_companion_rcs_batch || device->physical->has_vm_control);
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, is_companion_rcs_batch, &exec_flags,
+ &context_id);
+
+ execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf.objects,
+ .buffer_count = execbuf.bo_count,
+ .batch_start_offset = 0,
+ .batch_len = batch_bo_size,
+ .flags = I915_EXEC_HANDLE_LUT | exec_flags | I915_EXEC_NO_RELOC,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ };
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ if (anv_gem_execbuffer(device, &execbuf.execbuf)) {
+ result = vk_device_set_lost(&device->vk, "anv_gem_execbuffer failed: %m");
+ goto fail;
+ }
+
+ result = anv_device_wait(device, batch_bo, INT64_MAX);
+ if (result != VK_SUCCESS)
+ result = vk_device_set_lost(&device->vk,
+ "anv_device_wait failed: %m");
+
+fail:
+ anv_execbuf_finish(&execbuf);
+ return result;
+}
+
+VkResult
+i915_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo)
+{
+ struct anv_queue *queue = submit->queue;
+ struct anv_device *device = queue->device;
+ struct anv_trtt *trtt = &device->trtt;
+ struct anv_execbuf execbuf = {
+ .alloc = &device->vk.alloc,
+ .alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE,
+ };
+ VkResult result;
+
+ for (uint32_t i = 0; i < submit->wait_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf, submit->waits[i].sync,
+ false /* is_signal */,
+ submit->waits[i].wait_value);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ for (uint32_t i = 0; i < submit->signal_count; i++) {
+ result = anv_execbuf_add_sync(device, &execbuf, submit->signals[i].sync,
+ true /* is_signal */,
+ submit->signals[i].signal_value);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ result = anv_execbuf_add_syncobj(device, &execbuf, trtt->timeline_handle,
+ I915_EXEC_FENCE_SIGNAL,
+ trtt_bbo->timeline_val);
+ if (result != VK_SUCCESS)
+ goto out;
+
+
+ result = anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL,
+ 0);
+ if (result != VK_SUCCESS)
+ goto out;
+
+ for (int i = 0; i < trtt->num_page_table_bos; i++) {
+ result = anv_execbuf_add_bo(device, &execbuf, trtt->page_table_bos[i],
+ NULL, EXEC_OBJECT_WRITE);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ if (queue->sync) {
+ result = anv_execbuf_add_sync(device, &execbuf, queue->sync,
+ true /* is_signal */,
+ 0 /* signal_value */);
+ if (result != VK_SUCCESS)
+ goto out;
+ }
+
+ result = anv_execbuf_add_bo(device, &execbuf, trtt_bbo->bo, NULL, 0);
+ if (result != VK_SUCCESS)
+ goto out;
+
+ if (INTEL_DEBUG(DEBUG_SUBMIT))
+ anv_i915_debug_submit(&execbuf);
+
+ uint64_t exec_flags = 0;
+ uint32_t context_id;
+ get_context_and_exec_flags(queue, false, &exec_flags, &context_id);
+
+ execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
+ .buffers_ptr = (uintptr_t) execbuf.objects,
+ .buffer_count = execbuf.bo_count,
+ .batch_start_offset = 0,
+ .batch_len = trtt_bbo->size,
+ .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | exec_flags,
+ .rsvd1 = context_id,
+ .rsvd2 = 0,
+ };
+ setup_execbuf_fence_params(&execbuf);
+
+ ANV_RMV(bos_gtt_map, device, execbuf.bos, execbuf.bo_count);
+
+ int ret = queue->device->info->no_hw ? 0 :
+ anv_gem_execbuffer(device, &execbuf.execbuf);
+ if (ret) {
+ result = vk_device_set_lost(&device->vk,
+ "trtt anv_gem_execbuffer failed: %m");
+ goto out;
+ }
+
+ if (queue->sync) {
+ result = vk_sync_wait(&device->vk, queue->sync, 0,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+ if (result != VK_SUCCESS) {
+ result = vk_queue_set_lost(&queue->vk, "trtt sync wait failed");
+ goto out;
+ }
+ }
+
+out:
+ anv_execbuf_finish(&execbuf);
+ return result;
+}
+
+VkResult
+i915_queue_exec_trace(struct anv_queue *queue,
+ struct anv_utrace_submit *submit)
+{
+ assert(util_dynarray_num_elements(&submit->batch_bos,
+ struct anv_bo *) > 0);
+
+ return anv_queue_exec_utrace_locked(queue, submit);
+}
diff --git a/src/intel/vulkan/i915/anv_batch_chain.h b/src/intel/vulkan/i915/anv_batch_chain.h
new file mode 100644
index 00000000000..fc799582828
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_batch_chain.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "vulkan/vulkan_core.h"
+
+#include "vk_sync.h"
+
+struct anv_device;
+struct anv_queue;
+struct anv_bo;
+struct anv_cmd_buffer;
+struct anv_query_pool;
+struct anv_utrace_submit;
+struct anv_sparse_submission;
+struct anv_trtt_batch_bo;
+
+VkResult
+i915_queue_exec_trace(struct anv_queue *queue,
+ struct anv_utrace_submit *submit);
+VkResult
+i915_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
+ uint32_t batch_bo_size, bool is_companion_rcs_batch);
+
+VkResult
+i915_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo);
+
+VkResult
+i915_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit);
diff --git a/src/intel/vulkan/i915/anv_device.c b/src/intel/vulkan/i915/anv_device.c
new file mode 100644
index 00000000000..818b514ca1c
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_device.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "i915/anv_device.h"
+#include "anv_private.h"
+
+#include "common/i915/intel_defines.h"
+#include "common/i915/intel_gem.h"
+
+#include "drm-uapi/i915_drm.h"
+
+static int
+vk_priority_to_i915(VkQueueGlobalPriorityKHR priority)
+{
+ switch (priority) {
+ case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR:
+ return INTEL_CONTEXT_LOW_PRIORITY;
+ case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR:
+ return INTEL_CONTEXT_MEDIUM_PRIORITY;
+ case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR:
+ return INTEL_CONTEXT_HIGH_PRIORITY;
+ case VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR:
+ return INTEL_CONTEXT_REALTIME_PRIORITY;
+ default:
+ unreachable("Invalid priority");
+ }
+}
+
+int
+anv_gem_set_context_param(int fd, uint32_t context, uint32_t param, uint64_t value)
+{
+ if (param == I915_CONTEXT_PARAM_PRIORITY)
+ value = vk_priority_to_i915(value);
+
+ int err = 0;
+ if (!intel_gem_set_context_param(fd, context, param, value))
+ err = -errno;
+ return err;
+}
+
+static bool
+anv_gem_has_context_priority(int fd, VkQueueGlobalPriorityKHR priority)
+{
+    * higher than the current value. A timeline wait on value 0 is always
+    * trivially satisfied because every uint64_t value is >= 0.
+}
+
+VkResult
+anv_i915_physical_device_get_parameters(struct anv_physical_device *device)
+{
+ VkResult result = VK_SUCCESS;
+ int val, fd = device->local_fd;
+ uint64_t value;
+
+ if (!intel_gem_get_param(fd, I915_PARAM_HAS_WAIT_TIMEOUT, &val) || !val) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing gem wait");
+ return result;
+ }
+
+ if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXECBUF2, &val) || !val) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing execbuf2");
+ return result;
+ }
+
+ if (!device->info.has_llc &&
+ (!intel_gem_get_param(fd, I915_PARAM_MMAP_VERSION, &val) || val < 1)) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing wc mmap");
+ return result;
+ }
+
+ if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN, &val) || !val) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing softpin");
+ return result;
+ }
+
+ if (!intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE_ARRAY, &val) || !val) {
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel missing syncobj support");
+ return result;
+ }
+
+ if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC, &val))
+ device->has_exec_async = val;
+ if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE, &val))
+ device->has_exec_capture = val;
+
+   /* Probe from the lowest priority upward; the array is sorted low to high. */
+ const VkQueueGlobalPriorityKHR priorities[] = {
+ VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR,
+ VK_QUEUE_GLOBAL_PRIORITY_REALTIME_KHR,
+ };
+ device->max_context_priority = VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR;
+ for (unsigned i = 0; i < ARRAY_SIZE(priorities); i++) {
+ if (!anv_gem_has_context_priority(fd, priorities[i]))
+ break;
+ device->max_context_priority = priorities[i];
+ }
+
+ if (intel_gem_get_param(fd, I915_PARAM_HAS_EXEC_TIMELINE_FENCES, &val))
+ device->has_exec_timeline = val;
+
+ if (intel_gem_get_context_param(fd, 0, I915_CONTEXT_PARAM_VM, &value))
+ device->has_vm_control = value;
+
+ return result;
+}
+
+VkResult
+anv_i915_physical_device_init_memory_types(struct anv_physical_device *device)
+{
+ if (anv_physical_device_has_vram(device)) {
+ device->memory.type_count = 3;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 1,
+ };
+ device->memory.types[2] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+         /* This memory type either comes from heaps[0] if there is only a
+          * mappable vram region, or from heaps[2] if there are both mappable
+          * and non-mappable vram regions.
+ */
+ .heapIndex = device->vram_non_mappable.size > 0 ? 2 : 0,
+ };
+ } else if (device->info.has_llc) {
+ /* Big core GPUs share LLC with the CPU and thus one memory type can be
+ * both cached and coherent at the same time.
+ *
+       * But some game engines can't handle a single memory type well:
+       * https://gitlab.freedesktop.org/mesa/mesa/-/issues/7360#note_1719438
+       *
+       * The second memory type, without HOST_CACHED_BIT, will get
+       * write-combining. See anv_AllocateMemory().
+ *
+ * The Intel Vulkan driver for Windows also advertises these memory types.
+ */
+ device->memory.type_count = 3;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[2] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 0,
+ };
+ } else {
+ /* The spec requires that we expose a host-visible, coherent memory
+       * type, but Atom GPUs don't share LLC. Thus we offer two memory types:
+       * one that is cached but not coherent, and one that is coherent but
+       * uncached (write-combined).
+ */
+ device->memory.type_count = 2;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 0,
+ };
+ }
+
+ if (device->has_protected_contexts) {
+ /* Add a memory type for protected buffers, local and not host
+ * visible.
+ */
+ device->memory.types[device->memory.type_count++] =
+ (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_PROTECTED_BIT,
+ .heapIndex = 0,
+ };
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_i915_set_queue_parameters(
+ struct anv_device *device,
+ uint32_t context_id,
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority)
+{
+ struct anv_physical_device *physical_device = device->physical;
+
+ /* Here we tell the kernel not to attempt to recover our context but
+ * immediately (on the next batchbuffer submission) report that the
+ * context is lost, and we will do the recovery ourselves. In the case
+ * of Vulkan, recovery means throwing VK_ERROR_DEVICE_LOST and letting
+ * the client clean up the pieces.
+ */
+ anv_gem_set_context_param(device->fd, context_id,
+ I915_CONTEXT_PARAM_RECOVERABLE, false);
+
+ VkQueueGlobalPriorityKHR priority =
+ queue_priority ? queue_priority->globalPriority :
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
+
+ /* As per spec, the driver implementation may deny requests to acquire
+ * a priority above the default priority (MEDIUM) if the caller does not
+ * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_KHR
+ * is returned.
+ */
+ if (physical_device->max_context_priority >= VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) {
+ int err = anv_gem_set_context_param(device->fd, context_id,
+ I915_CONTEXT_PARAM_PRIORITY,
+ priority);
+ if (err != 0 && priority > VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) {
+ return vk_error(device, VK_ERROR_NOT_PERMITTED_KHR);
+ }
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_i915_device_setup_context(struct anv_device *device,
+ const VkDeviceCreateInfo *pCreateInfo,
+ const uint32_t num_queues)
+{
+ device->protected_session_id = I915_PROTECTED_CONTENT_DEFAULT_SESSION;
+
+ if (device->physical->has_vm_control)
+ return anv_i915_device_setup_vm(device);
+
+ struct anv_physical_device *physical_device = device->physical;
+ VkResult result = VK_SUCCESS;
+
+ if (device->physical->engine_info) {
+ /* The kernel API supports at most 64 engines */
+ assert(num_queues <= 64);
+ enum intel_engine_class engine_classes[64];
+ int engine_count = 0;
+ enum intel_gem_create_context_flags flags = 0;
+ for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
+ const VkDeviceQueueCreateInfo *queueCreateInfo =
+ &pCreateInfo->pQueueCreateInfos[i];
+
+ assert(queueCreateInfo->queueFamilyIndex <
+ physical_device->queue.family_count);
+ struct anv_queue_family *queue_family =
+ &physical_device->queue.families[queueCreateInfo->queueFamilyIndex];
+
+ for (uint32_t j = 0; j < queueCreateInfo->queueCount; j++)
+ engine_classes[engine_count++] = queue_family->engine_class;
+
+ if (pCreateInfo->pQueueCreateInfos[i].flags &
+ VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
+ flags |= INTEL_GEM_CREATE_CONTEXT_EXT_PROTECTED_FLAG;
+ }
+ if (!intel_gem_create_context_engines(device->fd, flags,
+ physical_device->engine_info,
+ engine_count, engine_classes,
+ device->vm_id,
+ (uint32_t *)&device->context_id))
+ result = vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "kernel context creation failed");
+ } else {
+ assert(num_queues == 1);
+ if (!intel_gem_create_context(device->fd, &device->context_id))
+ result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
+ }
+
+ if (result != VK_SUCCESS)
+ return result;
+
+ /* Check if client specified queue priority. */
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority =
+ vk_find_struct_const(pCreateInfo->pQueueCreateInfos[0].pNext,
+ DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+
+ result = anv_i915_set_queue_parameters(device, device->context_id,
+ queue_priority);
+ if (result != VK_SUCCESS)
+ goto fail_context;
+
+ return result;
+
+fail_context:
+ intel_gem_destroy_context(device->fd, device->context_id);
+ return result;
+}
+
+static VkResult
+anv_gem_context_get_reset_stats(struct anv_device *device, int context)
+{
+ struct drm_i915_reset_stats stats = {
+ .ctx_id = context,
+ };
+
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats);
+ if (ret == -1) {
+ /* We don't know the real error. */
+ return vk_device_set_lost(&device->vk, "get_reset_stats failed: %m");
+ }
+
+ if (stats.batch_active) {
+ return vk_device_set_lost(&device->vk, "GPU hung on one of our command buffers");
+ } else if (stats.batch_pending) {
+ return vk_device_set_lost(&device->vk, "GPU hung with commands in-flight");
+ }
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_i915_device_check_status(struct vk_device *vk_device)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ VkResult result;
+
+ if (device->physical->has_vm_control) {
+ for (uint32_t i = 0; i < device->queue_count; i++) {
+ result = anv_gem_context_get_reset_stats(device,
+ device->queues[i].context_id);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->queues[i].companion_rcs_id != 0) {
+ uint32_t context_id = device->queues[i].companion_rcs_id;
+ result = anv_gem_context_get_reset_stats(device, context_id);
+ if (result != VK_SUCCESS) {
+ return result;
+ }
+ }
+ }
+ } else {
+ result = anv_gem_context_get_reset_stats(device, device->context_id);
+ }
+
+ return result;
+}
+
+bool
+anv_i915_device_destroy_vm(struct anv_device *device)
+{
+ struct drm_i915_gem_vm_control destroy = {
+ .vm_id = device->vm_id,
+ };
+
+ return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_VM_DESTROY, &destroy) == 0;
+}
+
+VkResult
+anv_i915_device_setup_vm(struct anv_device *device)
+{
+ struct drm_i915_gem_vm_control create = {};
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_VM_CREATE, &create))
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "vm creation failed");
+
+ device->vm_id = create.vm_id;
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/i915/anv_device.h b/src/intel/vulkan/i915/anv_device.h
new file mode 100644
index 00000000000..0d871a41199
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_device.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "vulkan/vulkan_core.h"
+#include "vk_device.h"
+
+struct anv_device;
+struct anv_physical_device;
+
+VkResult
+anv_i915_physical_device_get_parameters(struct anv_physical_device *device);
+VkResult
+anv_i915_physical_device_init_memory_types(struct anv_physical_device *device);
+
+VkResult
+anv_i915_device_setup_context(struct anv_device *device,
+ const VkDeviceCreateInfo *pCreateInfo,
+ const uint32_t num_queues);
+
+VkResult anv_i915_device_check_status(struct vk_device *vk_device);
+bool anv_i915_device_destroy_vm(struct anv_device *device);
+VkResult anv_i915_device_setup_vm(struct anv_device *device);
+VkResult anv_i915_set_queue_parameters(
+ struct anv_device *device,
+ uint32_t context_id,
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority);
diff --git a/src/intel/vulkan/i915/anv_gem.c b/src/intel/vulkan/i915/anv_gem.c
new file mode 100644
index 00000000000..a159844aa31
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_gem.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "i915/anv_gem.h"
+#include "anv_private.h"
+
+#include "drm-uapi/i915_drm.h"
+
+int
+anv_i915_gem_get_tiling(struct anv_device *device, uint32_t gem_handle)
+{
+ if (!device->info->has_tiling_uapi)
+ return -1;
+
+ struct drm_i915_gem_get_tiling get_tiling = {
+ .handle = gem_handle,
+ };
+
+ /* FIXME: On discrete platforms we don't have DRM_IOCTL_I915_GEM_GET_TILING
+ * anymore, so we will need another way to get the tiling. Apparently this
+ * is only used in Android code, so we may need some other way to
+ * communicate the tiling mode.
+ */
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling)) {
+ assert(!"Failed to get BO tiling");
+ return -1;
+ }
+
+ return get_tiling.tiling_mode;
+}
+
+int
+anv_i915_gem_set_tiling(struct anv_device *device, uint32_t gem_handle,
+ uint32_t stride, uint32_t tiling)
+{
+ /* On discrete platforms we don't have DRM_IOCTL_I915_GEM_SET_TILING. So
+ * nothing needs to be done.
+ */
+ if (!device->info->has_tiling_uapi)
+ return 0;
+
+ /* set_tiling overwrites the input on the error path, so the struct is
+ * initialized fresh right before the call.
+ */
+ struct drm_i915_gem_set_tiling set_tiling = {
+ .handle = gem_handle,
+ .tiling_mode = tiling,
+ .stride = stride,
+ };
+
+ return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
+}
+
+int
+anv_i915_gem_wait(struct anv_device *device, uint32_t gem_handle,
+ int64_t *timeout_ns)
+{
+ struct drm_i915_gem_wait wait = {
+ .bo_handle = gem_handle,
+ .timeout_ns = *timeout_ns,
+ .flags = 0,
+ };
+
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
+ *timeout_ns = wait.timeout_ns;
+
+ return ret;
+}
+
+VkResult
+anv_i915_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *out_bo_flags)
+{
+ const uint32_t bo_flags =
+ device->kmd_backend->bo_alloc_flags_to_bo_flags(device, alloc_flags);
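+ /* refcount == 0 means the BO is a fresh import rather than one already in
+ * the cache, so the newly computed flags can be used as-is.
+ */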
+ if (bo->refcount == 0) {
+ *out_bo_flags = bo_flags;
+ return VK_SUCCESS;
+ }
+
+ /* We have to be careful how we combine flags so that it makes sense.
+ * Really, though, if we get to this case and it actually matters, the
+ * client has imported a BO twice in different ways and they get what
+ * they have coming.
+ */
+ uint32_t new_flags = 0;
+ new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_WRITE;
+ new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_ASYNC;
+ new_flags |= (bo->flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+ new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_PINNED;
+ new_flags |= (bo->flags | bo_flags) & EXEC_OBJECT_CAPTURE;
+
+ /* It's theoretically possible for a BO to get imported such that it's
+ * both pinned and not pinned. The only way this can happen is if it
+ * gets imported as both a semaphore and a memory object and that would
+ * be an application error. Just fail out in that case.
+ */
+ if ((bo->flags & EXEC_OBJECT_PINNED) !=
+ (bo_flags & EXEC_OBJECT_PINNED))
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ "The same BO was imported two different ways");
+
+ /* It's also theoretically possible that someone could export a BO from
+ * one heap and import it into another or to import the same BO into two
+ * different heaps. If this happens, we could potentially end up both
+ * allowing and disallowing 48-bit addresses. There's not much we can
+ * do about it if we're pinning so we just throw an error and hope no
+ * app is actually that stupid.
+ */
+ if ((new_flags & EXEC_OBJECT_PINNED) &&
+ (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) !=
+ (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
+ return vk_errorf(device, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ "The same BO was imported on two different heaps");
+
+ *out_bo_flags = new_flags;
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/i915/anv_gem.h b/src/intel/vulkan/i915/anv_gem.h
new file mode 100644
index 00000000000..bf3713f86f3
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_gem.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "vulkan/vulkan_core.h"
+
+struct anv_bo;
+struct anv_device;
+enum anv_bo_alloc_flags;
+
+int anv_i915_gem_get_tiling(struct anv_device *device, uint32_t gem_handle);
+int anv_i915_gem_set_tiling(struct anv_device *device, uint32_t gem_handle,
+ uint32_t stride, uint32_t tiling);
+
+int anv_i915_gem_wait(struct anv_device *device, uint32_t gem_handle,
+ int64_t *timeout_ns);
+
+VkResult anv_i915_gem_import_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ struct anv_bo *bo,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint32_t *out_bo_flags);
diff --git a/src/intel/vulkan/i915/anv_kmd_backend.c b/src/intel/vulkan/i915/anv_kmd_backend.c
new file mode 100644
index 00000000000..253abfd959e
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_kmd_backend.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <sys/mman.h>
+
+#include "anv_private.h"
+
+#include "i915/anv_batch_chain.h"
+
+#include "drm-uapi/i915_drm.h"
+#include "intel/common/i915/intel_gem.h"
+
+static int
+i915_gem_set_caching(struct anv_device *device,
+ uint32_t gem_handle, uint32_t caching)
+{
+ struct drm_i915_gem_caching gem_caching = {
+ .handle = gem_handle,
+ .caching = caching,
+ };
+
+ return intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &gem_caching);
+}
+
+static uint32_t
+i915_gem_create(struct anv_device *device,
+ const struct intel_memory_class_instance **regions,
+ uint16_t num_regions, uint64_t size,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t *actual_size)
+{
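+ /* Without memory class/instance support in the kernel, fall back to the
+ * legacy DRM_IOCTL_I915_GEM_CREATE path.
+ */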
+ if (unlikely(!device->info->mem.use_class_instance)) {
+ assert(num_regions == 1 &&
+ device->physical->sys.region == regions[0]);
+
+ struct drm_i915_gem_create gem_create = {
+ .size = size,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create))
+ return 0;
+
+ if ((alloc_flags & ANV_BO_ALLOC_HOST_CACHED_COHERENT) == ANV_BO_ALLOC_HOST_CACHED_COHERENT) {
+ /* We don't want to change these defaults if it's going to be shared
+ * with another process.
+ */
+ assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL));
+
+ /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
+ * I915_CACHING_NONE on non-LLC platforms. For many internal state
+ * objects, we'd rather take the snooping overhead than risk forgetting
+ * a CLFLUSH somewhere. Userptr objects are always created as
+ * I915_CACHING_CACHED, which on non-LLC means snooped so there's no
+ * need to do this there.
+ */
+ if (device->info->has_caching_uapi && !device->info->has_llc)
+ i915_gem_set_caching(device, gem_create.handle, I915_CACHING_CACHED);
+ }
+
+ *actual_size = gem_create.size;
+ return gem_create.handle;
+ }
+
+ struct drm_i915_gem_memory_class_instance i915_regions[2];
+ assert(num_regions <= ARRAY_SIZE(i915_regions));
+
+ for (uint16_t i = 0; i < num_regions; i++) {
+ i915_regions[i].memory_class = regions[i]->klass;
+ i915_regions[i].memory_instance = regions[i]->instance;
+ }
+
+ uint32_t flags = 0;
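+ /* If the BO needs to be CPU-mappable and the device has non-mappable
+ * VRAM, ask the kernel for a placement with CPU access.
+ */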
+ if (alloc_flags & (ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE) &&
+ !(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM))
+ if (device->physical->vram_non_mappable.size > 0)
+ flags |= I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS;
+
+ struct drm_i915_gem_create_ext_memory_regions ext_regions = {
+ .num_regions = num_regions,
+ .regions = (uintptr_t)i915_regions,
+ };
+ struct drm_i915_gem_create_ext gem_create = {
+ .size = size,
+ .flags = flags,
+ };
+
+ intel_i915_gem_add_ext(&gem_create.extensions,
+ I915_GEM_CREATE_EXT_MEMORY_REGIONS,
+ &ext_regions.base);
+
+ struct drm_i915_gem_create_ext_set_pat set_pat_param = { 0 };
+ if (device->info->has_set_pat_uapi) {
+ /* Set PAT param */
+ set_pat_param.pat_index = anv_device_get_pat_entry(device, alloc_flags)->index;
+ intel_i915_gem_add_ext(&gem_create.extensions,
+ I915_GEM_CREATE_EXT_SET_PAT,
+ &set_pat_param.base);
+ }
+
+ struct drm_i915_gem_create_ext_protected_content protected_param = { 0 };
+ if (alloc_flags & ANV_BO_ALLOC_PROTECTED) {
+ intel_i915_gem_add_ext(&gem_create.extensions,
+ I915_GEM_CREATE_EXT_PROTECTED_CONTENT,
+ &protected_param.base);
+ }
+
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &gem_create))
+ return 0;
+
+ *actual_size = gem_create.size;
+
+ if ((alloc_flags & ANV_BO_ALLOC_HOST_CACHED_COHERENT) == ANV_BO_ALLOC_HOST_CACHED_COHERENT) {
+ /* We don't want to change these defaults if it's going to be shared
+ * with another process.
+ */
+ assert(!(alloc_flags & ANV_BO_ALLOC_EXTERNAL));
+
+ /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
+ * I915_CACHING_NONE on non-LLC platforms. For many internal state
+ * objects, we'd rather take the snooping overhead than risk forgetting
+ * a CLFLUSH somewhere. Userptr objects are always created as
+ * I915_CACHING_CACHED, which on non-LLC means snooped so there's no
+ * need to do this there.
+ */
+ if (device->info->has_caching_uapi && !device->info->has_llc)
+ i915_gem_set_caching(device, gem_create.handle, I915_CACHING_CACHED);
+ }
+
+ return gem_create.handle;
+}
+
+static void
+i915_gem_close(struct anv_device *device, struct anv_bo *bo)
+{
+ struct drm_gem_close close = {
+ .handle = bo->gem_handle,
+ };
+
+ intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close);
+}
+
+static void *
+i915_gem_mmap_offset(struct anv_device *device, struct anv_bo *bo,
+ uint64_t size, uint32_t flags,
+ void *placed_addr)
+{
+ struct drm_i915_gem_mmap_offset gem_mmap = {
+ .handle = bo->gem_handle,
+ .flags = flags,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &gem_mmap))
+ return MAP_FAILED;
+
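+ /* A non-NULL placed_addr means the caller wants the mapping at that exact
+ * address, hence MAP_FIXED.
+ */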
+ return mmap(placed_addr, size, PROT_READ | PROT_WRITE,
+ (placed_addr != NULL ? MAP_FIXED : 0) | MAP_SHARED,
+ device->fd, gem_mmap.offset);
+}
+
+static void *
+i915_gem_mmap_legacy(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
+ uint64_t size, uint32_t flags)
+{
+ struct drm_i915_gem_mmap gem_mmap = {
+ .handle = bo->gem_handle,
+ .offset = offset,
+ .size = size,
+ .flags = flags,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap))
+ return MAP_FAILED;
+
+ return (void *)(uintptr_t) gem_mmap.addr_ptr;
+}
+
+static uint32_t
+mmap_calc_flags(struct anv_device *device, struct anv_bo *bo)
+{
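+ /* Local-memory (discrete) platforms only support the fixed mmap offset
+ * mode.
+ */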
+ if (device->info->has_local_mem)
+ return I915_MMAP_OFFSET_FIXED;
+
+ uint32_t flags;
+ switch (anv_bo_get_mmap_mode(device, bo)) {
+ case INTEL_DEVICE_INFO_MMAP_MODE_WC:
+ flags = I915_MMAP_WC;
+ break;
+ case INTEL_DEVICE_INFO_MMAP_MODE_UC:
+ unreachable("Missing");
+ default:
+ /* no flags == WB */
+ flags = 0;
+ }
+
+ if (likely(device->physical->info.has_mmap_offset))
+ flags = (flags & I915_MMAP_WC) ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB;
+ return flags;
+}
+
+static void *
+i915_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
+ uint64_t size, void *placed_addr)
+{
+ const uint32_t flags = mmap_calc_flags(device, bo);
+
+ if (likely(device->physical->info.has_mmap_offset))
+ return i915_gem_mmap_offset(device, bo, size, flags, placed_addr);
+ assert(placed_addr == NULL);
+ return i915_gem_mmap_legacy(device, bo, offset, size, flags);
+}
+
+static VkResult
+i915_vm_bind(struct anv_device *device, struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags)
+{
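+ /* i915 has no explicit VM_BIND uAPI; BOs are bound at execbuf time, so
+ * there is nothing to do here.
+ */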
+ return VK_SUCCESS;
+}
+
+static VkResult
+i915_vm_bind_bo(struct anv_device *device, struct anv_bo *bo)
+{
+ return VK_SUCCESS;
+}
+
+static uint32_t
+i915_gem_create_userptr(struct anv_device *device, void *mem, uint64_t size)
+{
+ struct drm_i915_gem_userptr userptr = {
+ .user_ptr = (__u64)((unsigned long) mem),
+ .user_size = size,
+ .flags = 0,
+ };
+
+ if (device->physical->info.has_userptr_probe)
+ userptr.flags |= I915_USERPTR_PROBE;
+
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_I915_GEM_USERPTR, &userptr);
+ if (ret == -1)
+ return 0;
+
+ return userptr.handle;
+}
+
+static uint32_t
+i915_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
+{
+ struct anv_physical_device *pdevice = device->physical;
+
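+ /* ANV always softpins its BOs, so every exec object is pinned. */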
+ uint64_t bo_flags = EXEC_OBJECT_PINNED;
+
+ if (!(alloc_flags & ANV_BO_ALLOC_32BIT_ADDRESS))
+ bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+ if (((alloc_flags & ANV_BO_ALLOC_CAPTURE) ||
+ INTEL_DEBUG(DEBUG_CAPTURE_ALL)) &&
+ pdevice->has_exec_capture)
+ bo_flags |= EXEC_OBJECT_CAPTURE;
+
+ if (alloc_flags & ANV_BO_ALLOC_IMPLICIT_WRITE) {
+ assert(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC);
+ bo_flags |= EXEC_OBJECT_WRITE;
+ }
+
+ if (!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_SYNC) && pdevice->has_exec_async)
+ bo_flags |= EXEC_OBJECT_ASYNC;
+
+ return bo_flags;
+}
+
+const struct anv_kmd_backend *
+anv_i915_kmd_backend_get(void)
+{
+ static const struct anv_kmd_backend i915_backend = {
+ .gem_create = i915_gem_create,
+ .gem_create_userptr = i915_gem_create_userptr,
+ .gem_close = i915_gem_close,
+ .gem_mmap = i915_gem_mmap,
+ .vm_bind = i915_vm_bind,
+ .vm_bind_bo = i915_vm_bind_bo,
+ .vm_unbind_bo = i915_vm_bind_bo,
+ .execute_simple_batch = i915_execute_simple_batch,
+ .execute_trtt_batch = i915_execute_trtt_batch,
+ .queue_exec_locked = i915_queue_exec_locked,
+ .queue_exec_trace = i915_queue_exec_trace,
+ .bo_alloc_flags_to_bo_flags = i915_bo_alloc_flags_to_bo_flags,
+ };
+ return &i915_backend;
+}
diff --git a/src/intel/vulkan/i915/anv_queue.c b/src/intel/vulkan/i915/anv_queue.c
new file mode 100644
index 00000000000..173cf7b2a3a
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_queue.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "i915/anv_queue.h"
+
+#include "anv_private.h"
+
+#include "common/i915/intel_engine.h"
+#include "common/intel_gem.h"
+
+#include "i915/anv_device.h"
+
+#include "drm-uapi/i915_drm.h"
+
+VkResult
+anv_i915_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo)
+{
+ struct anv_physical_device *physical = device->physical;
+ struct anv_queue_family *queue_family =
+ &physical->queue.families[pCreateInfo->queueFamilyIndex];
+
+ if (device->physical->engine_info == NULL) {
+ switch (queue_family->engine_class) {
+ case INTEL_ENGINE_CLASS_COPY:
+ queue->exec_flags = I915_EXEC_BLT;
+ break;
+ case INTEL_ENGINE_CLASS_RENDER:
+ queue->exec_flags = I915_EXEC_RENDER;
+ break;
+ case INTEL_ENGINE_CLASS_VIDEO:
+ /* We want VCS0 (with ring1) for HW lacking HEVC on VCS1. */
+ queue->exec_flags = I915_EXEC_BSD | I915_EXEC_BSD_RING1;
+ break;
+ default:
+ unreachable("Unsupported legacy engine");
+ }
+ } else if (device->physical->has_vm_control) {
+ assert(pCreateInfo->queueFamilyIndex < physical->queue.family_count);
+ enum intel_engine_class engine_classes[1];
+ enum intel_gem_create_context_flags flags = 0;
+
+ engine_classes[0] = queue_family->engine_class;
+ if (pCreateInfo->flags & VK_DEVICE_QUEUE_CREATE_PROTECTED_BIT)
+ flags |= INTEL_GEM_CREATE_CONTEXT_EXT_PROTECTED_FLAG;
+
+ if (!intel_gem_create_context_engines(device->fd, flags,
+ physical->engine_info,
+ 1, engine_classes,
+ device->vm_id,
+ (uint32_t *)&queue->context_id))
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "engine creation failed");
+
+ /* Create a companion RCS logical engine to support MSAA copy/clear
+ * operations on the compute/copy engines.
+ */
+ if (queue_family->engine_class == INTEL_ENGINE_CLASS_COPY ||
+ queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ uint32_t *context_id = (uint32_t *)&queue->companion_rcs_id;
+ engine_classes[0] = INTEL_ENGINE_CLASS_RENDER;
+ if (!intel_gem_create_context_engines(device->fd, flags,
+ physical->engine_info,
+ 1, engine_classes,
+ device->vm_id,
+ context_id))
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "companion RCS engine creation failed");
+ }
+
+ /* Check if client specified queue priority. */
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority =
+ vk_find_struct_const(pCreateInfo->pNext,
+ DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+
+ VkResult result = anv_i915_set_queue_parameters(device,
+ queue->context_id,
+ queue_priority);
+ if (result != VK_SUCCESS) {
+ intel_gem_destroy_context(device->fd, queue->context_id);
+ if (queue->companion_rcs_id != 0) {
+ intel_gem_destroy_context(device->fd, queue->companion_rcs_id);
+ }
+ return result;
+ }
+ } else {
+ /* When using the new engine creation uAPI, the exec_flags value is the
+ * index of the engine in the group specified at GEM context creation.
+ */
+ queue->exec_flags = device->queue_count;
+ }
+
+ return VK_SUCCESS;
+}
+
+void
+anv_i915_destroy_engine(struct anv_device *device, struct anv_queue *queue)
+{
+ if (device->physical->has_vm_control) {
+ intel_gem_destroy_context(device->fd, queue->context_id);
+
+ if (queue->companion_rcs_id != 0) {
+ intel_gem_destroy_context(device->fd, queue->companion_rcs_id);
+ }
+ }
+}
diff --git a/src/intel/vulkan/i915/anv_queue.h b/src/intel/vulkan/i915/anv_queue.h
new file mode 100644
index 00000000000..ab75cd5b2cb
--- /dev/null
+++ b/src/intel/vulkan/i915/anv_queue.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "vulkan/vulkan_core.h"
+
+struct anv_device;
+struct anv_queue;
+
+VkResult
+anv_i915_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo);
+void
+anv_i915_destroy_engine(struct anv_device *device, struct anv_queue *queue);
diff --git a/src/intel/vulkan/anv_wsi_wayland.c b/src/intel/vulkan/layers/anv_android_layer.c
index 13c59604ffe..b9ccc60649c 100644
--- a/src/intel/vulkan/anv_wsi_wayland.c
+++ b/src/intel/vulkan/layers/anv_android_layer.c
@@ -1,5 +1,5 @@
/*
- * Copyright © 2015 Intel Corporation
+ * Copyright © 2023 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,33 +21,26 @@
* IN THE SOFTWARE.
*/
-#include "wsi_common_wayland.h"
#include "anv_private.h"
-VkBool32 anv_GetPhysicalDeviceWaylandPresentationSupportKHR(
- VkPhysicalDevice physicalDevice,
- uint32_t queueFamilyIndex,
- struct wl_display* display)
-{
- ANV_FROM_HANDLE(anv_physical_device, physical_device, physicalDevice);
-
- return wsi_wl_get_presentation_support(&physical_device->wsi_device, display);
-}
-
-VkResult anv_CreateWaylandSurfaceKHR(
- VkInstance _instance,
- const VkWaylandSurfaceCreateInfoKHR* pCreateInfo,
+VkResult anv_android_CreateImageView(
+ VkDevice _device,
+ const VkImageViewCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator,
- VkSurfaceKHR* pSurface)
+ VkImageView* pView)
{
- ANV_FROM_HANDLE(anv_instance, instance, _instance);
- const VkAllocationCallbacks *alloc;
- assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR);
-
- if (pAllocator)
- alloc = pAllocator;
- else
- alloc = &instance->vk.alloc;
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ const struct util_format_description *fmt =
+ vk_format_description(pCreateInfo->format);
- return wsi_create_wl_surface(alloc, pCreateInfo, pSurface);
+ /* Return an error if the application tries to create an ASTC view on
+ * gfx125. This avoids the GPU hang that can result from using the
+ * unsupported format.
+ */
+ if (fmt && fmt->layout == UTIL_FORMAT_LAYOUT_ASTC &&
+ device->info->verx10 >= 125) {
+ return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
+ "ASTC format not supported (%s).", __func__);
+ }
+ return anv_CreateImageView(_device, pCreateInfo, pAllocator, pView);
}
diff --git a/src/intel/vulkan/layers/anv_doom64.c b/src/intel/vulkan/layers/anv_doom64.c
new file mode 100644
index 00000000000..8fe0287c417
--- /dev/null
+++ b/src/intel/vulkan/layers/anv_doom64.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/set.h"
+#include "anv_private.h"
+#include "vk_common_entrypoints.h"
+
+/**
+ * The DOOM 64 rendering corruption is happening because the game always uses
+ * ```
+ * vkCmdPipelineBarrier(VK_IMAGE_LAYOUT_UNDEFINED ->
+ * VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL)
+ * vkCmdCopyBufferToImage(...)
+ * vkCmdPipelineBarrier(VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL ->
+ * VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
+ * ```
+ * when it wants to update its texture atlas image.
+ *
+ * According to spec, transitioning from VK_IMAGE_LAYOUT_UNDEFINED means
+ * that the current image content might be discarded, but the game relies
+ * on it being fully preserved.
+ *
+ * This work-around layer implements super-barebone layout tracking: it
+ * allows the first transition from VK_IMAGE_LAYOUT_UNDEFINED, but replaces
+ * oldLayout with VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL for each
+ * subsequent transition of that image.
+ *
+ * Gen12+ does not ambiguate CCS data on the transition from
+ * VK_IMAGE_LAYOUT_UNDEFINED, so it preserves all compressed information and
+ * this WA is not needed.
+ */
+
+void anv_doom64_CmdPipelineBarrier(
+ VkCommandBuffer commandBuffer,
+ VkPipelineStageFlags srcStageMask,
+ VkPipelineStageFlags dstStageMask,
+ VkDependencyFlags dependencyFlags,
+ uint32_t memoryBarrierCount,
+ const VkMemoryBarrier* pMemoryBarriers,
+ uint32_t bufferMemoryBarrierCount,
+ const VkBufferMemoryBarrier* pBufferMemoryBarriers,
+ uint32_t imageMemoryBarrierCount,
+ const VkImageMemoryBarrier* pImageMemoryBarriers)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, command_buffer, commandBuffer);
+ assert(command_buffer && command_buffer->device);
+
+ VkImageMemoryBarrier fixed_barrier;
+ struct set * defined_images =
+ command_buffer->device->workarounds.doom64_images;
+
+ if (defined_images &&
+ imageMemoryBarrierCount == 1 && pImageMemoryBarriers &&
+ pImageMemoryBarriers[0].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
+ pImageMemoryBarriers[0].newLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
+ ANV_FROM_HANDLE(anv_image, image, pImageMemoryBarriers[0].image);
+
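+ /* First UNDEFINED transition of this image: just record it. Subsequent
+ * transitions are rewritten below so the existing contents are preserved.
+ */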
+ if (!_mesa_set_search(defined_images, image)) {
+ _mesa_set_add(defined_images, image);
+ } else {
+ memcpy(&fixed_barrier, pImageMemoryBarriers, sizeof(VkImageMemoryBarrier));
+
+ fixed_barrier.oldLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+
+ pImageMemoryBarriers = (const VkImageMemoryBarrier*) &fixed_barrier;
+ }
+ }
+
+ vk_common_CmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask,
+ dependencyFlags, memoryBarrierCount,
+ pMemoryBarriers, bufferMemoryBarrierCount,
+ pBufferMemoryBarriers,
+ imageMemoryBarrierCount,
+ pImageMemoryBarriers);
+}
+
+VkResult anv_doom64_CreateImage(
+ VkDevice _device,
+ const VkImageCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkImage* pImage)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ assert(device);
+
+ if (!device->workarounds.doom64_images) {
+ device->workarounds.doom64_images = _mesa_pointer_set_create(NULL);
+
+ if (!device->workarounds.doom64_images) {
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+ }
+ }
+
+ return anv_CreateImage(_device, pCreateInfo, pAllocator, pImage);
+}
+
+void anv_doom64_DestroyImage(
+ VkDevice _device,
+ VkImage _image,
+ const VkAllocationCallbacks* pAllocator)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ ANV_FROM_HANDLE(anv_image, image, _image);
+ assert(device);
+
+ struct set * defined_images = device->workarounds.doom64_images;
+
+ if (image && defined_images) {
+ _mesa_set_remove_key(defined_images, image);
+
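+ /* Tear down the tracking set once the last tracked image is destroyed. */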
+ if (!defined_images->entries) {
+ _mesa_set_destroy(defined_images, NULL);
+ device->workarounds.doom64_images = NULL;
+ }
+ }
+
+ anv_DestroyImage(_device, _image, pAllocator);
+}
diff --git a/src/intel/vulkan/layers/anv_hitman3.c b/src/intel/vulkan/layers/anv_hitman3.c
new file mode 100644
index 00000000000..a6add16d0c3
--- /dev/null
+++ b/src/intel/vulkan/layers/anv_hitman3.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2022 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+
+VkResult anv_hitman3_CreateBufferView(
+ VkDevice _device,
+ const VkBufferViewCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkBufferView* pView)
+{
+ ANV_FROM_HANDLE(anv_buffer, buffer, pCreateInfo->buffer);
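+ /* Hitman 3 creates R32G32B32_SFLOAT storage texel buffer views, a
+ * combination ANV does not support; reject the view creation up front.
+ */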
+ if (pCreateInfo->format == VK_FORMAT_R32G32B32_SFLOAT &&
+ (buffer->vk.usage & VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT)) {
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ return vk_errorf(device, VK_ERROR_UNKNOWN,
+ "invalid image format requested for storage");
+ }
+
+ return anv_CreateBufferView(_device, pCreateInfo, pAllocator, pView);
+}
diff --git a/src/intel/vulkan/layers/anv_rmv_layer.c b/src/intel/vulkan/layers/anv_rmv_layer.c
new file mode 100644
index 00000000000..2e36e5d4012
--- /dev/null
+++ b/src/intel/vulkan/layers/anv_rmv_layer.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "rmv/vk_rmv_common.h"
+#include "rmv/vk_rmv_tokens.h"
+#include "anv_private.h"
+#include "vk_common_entrypoints.h"
+
+VkResult anv_rmv_QueuePresentKHR(
+ VkQueue _queue,
+ const VkPresentInfoKHR* pPresentInfo)
+{
+ ANV_FROM_HANDLE(anv_queue, queue, _queue);
+ struct anv_device *device = queue->device;
+
+ VkResult res = anv_QueuePresentKHR(_queue, pPresentInfo);
+ if ((res != VK_SUCCESS && res != VK_SUBOPTIMAL_KHR) ||
+ !device->vk.memory_trace_data.is_enabled)
+ return res;
+
+ vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_PRESENT);
+
+ return VK_SUCCESS;
+}
+
+VkResult anv_rmv_FlushMappedMemoryRanges(
+ VkDevice _device,
+ uint32_t memoryRangeCount,
+ const VkMappedMemoryRange* pMemoryRanges)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ VkResult res = anv_FlushMappedMemoryRanges(_device, memoryRangeCount, pMemoryRanges);
+ if (res != VK_SUCCESS || !device->vk.memory_trace_data.is_enabled)
+ return res;
+
+ vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_FLUSH_MAPPED_RANGE);
+
+ return VK_SUCCESS;
+}
+
+VkResult anv_rmv_InvalidateMappedMemoryRanges(
+ VkDevice _device,
+ uint32_t memoryRangeCount,
+ const VkMappedMemoryRange* pMemoryRanges)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ VkResult res = anv_InvalidateMappedMemoryRanges(_device, memoryRangeCount, pMemoryRanges);
+ if (res != VK_SUCCESS || !device->vk.memory_trace_data.is_enabled)
+ return res;
+
+ vk_rmv_log_misc_token(&device->vk, VK_RMV_MISC_EVENT_TYPE_INVALIDATE_RANGES);
+
+ return VK_SUCCESS;
+}
+
+VkResult anv_rmv_SetDebugUtilsObjectNameEXT(
+ VkDevice _device,
+ const VkDebugUtilsObjectNameInfoEXT* pNameInfo)
+{
+ assert(pNameInfo->sType == VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT);
+ ANV_FROM_HANDLE(anv_device, device, _device);
+
+ VkResult result = vk_common_SetDebugUtilsObjectNameEXT(_device, pNameInfo);
+ if (result != VK_SUCCESS || !device->vk.memory_trace_data.is_enabled)
+ return result;
+
+ switch (pNameInfo->objectType) {
+ /* only name object types we care about */
+ case VK_OBJECT_TYPE_BUFFER:
+ case VK_OBJECT_TYPE_DEVICE_MEMORY:
+ case VK_OBJECT_TYPE_IMAGE:
+ case VK_OBJECT_TYPE_EVENT:
+ case VK_OBJECT_TYPE_QUERY_POOL:
+ case VK_OBJECT_TYPE_DESCRIPTOR_POOL:
+ case VK_OBJECT_TYPE_PIPELINE:
+ break;
+ default:
+ return VK_SUCCESS;
+ }
+
+ size_t name_len = strlen(pNameInfo->pObjectName);
+ char *name_buf = malloc(name_len + 1);
+ if (!name_buf) {
+ /*
+ * Silently fail, so that applications may still continue if possible.
+ */
+ return VK_SUCCESS;
+ }
+ strcpy(name_buf, pNameInfo->pObjectName);
+
+ simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
+ struct vk_rmv_userdata_token token;
+ token.name = name_buf;
+ token.resource_id = vk_rmv_get_resource_id_locked(&device->vk, pNameInfo->objectHandle);
+
+ vk_rmv_emit_token(&device->vk.memory_trace_data, VK_RMV_TOKEN_TYPE_USERDATA, &token);
+ simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
+
+ return VK_SUCCESS;
+}
diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build
index 97423f0b025..8eecda92547 100644
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -18,6 +18,15 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+inc_anv = include_directories('.')
+
+anv_flags = [
+ no_override_init_args,
+ sse2_args,
+]
+
+anv_cpp_flags = []
+
anv_entrypoints = custom_target(
'anv_entrypoints',
input : [vk_entrypoints_gen, vk_api_xml],
@@ -25,89 +34,169 @@ anv_entrypoints = custom_target(
command : [
prog_python, '@INPUT0@', '--xml', '@INPUT1@', '--proto', '--weak',
'--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'anv',
- '--device-prefix', 'gfx7', '--device-prefix', 'gfx75',
- '--device-prefix', 'gfx8', '--device-prefix', 'gfx9',
- '--device-prefix', 'gfx11', '--device-prefix', 'gfx12',
+ '--beta', with_vulkan_beta.to_string(),
+ '--device-prefix', 'gfx9',
+ '--device-prefix', 'gfx11',
+ '--device-prefix', 'gfx12',
'--device-prefix', 'gfx125',
+ '--device-prefix', 'gfx20',
+ '--device-prefix', 'anv_doom64',
+ '--device-prefix', 'anv_hitman3',
+ '--device-prefix', 'anv_android',
+ '--device-prefix', 'anv_rmv',
],
depend_files : vk_entrypoints_gen_depend_files,
)
+idep_anv_headers = declare_dependency(
+ sources : [anv_entrypoints[0]],
+ include_directories : inc_anv,
+)
+
+if with_intel_vk_rt
+ subdir('grl')
+ optional_libgrl = [libgrl]
+ anv_flags += '-DANV_SUPPORT_RT=1'
+else
+ idep_grl = null_dep
+ optional_libgrl = []
+ anv_flags += '-DANV_SUPPORT_RT=0'
+endif
+
intel_icd = custom_target(
'intel_icd',
input : [vk_icd_gen, vk_api_xml],
output : 'intel_icd.@0@.json'.format(host_machine.cpu()),
command : [
prog_python, '@INPUT0@',
- '--api-version', '1.2', '--xml', '@INPUT1@',
+ '--api-version', '1.3', '--xml', '@INPUT1@',
'--lib-path', join_paths(get_option('prefix'), get_option('libdir'),
'libvulkan_intel.so'),
'--out', '@OUTPUT@',
],
build_by_default : true,
install_dir : with_vulkan_icd_dir,
+ install_tag : 'runtime',
install : true,
)
+_dev_icdname = 'intel_devenv_icd.@0@.json'.format(host_machine.cpu())
+_dev_icd = custom_target(
+ 'intel_devenv_icd',
+ input : [vk_icd_gen, vk_api_xml],
+ output : _dev_icdname,
+ command : [
+ prog_python, '@INPUT0@',
+ '--api-version', '1.3', '--xml', '@INPUT1@',
+ '--lib-path', meson.current_build_dir() / 'libvulkan_intel.so',
+ '--out', '@OUTPUT@',
+ ],
+ build_by_default : true,
+)
+
+devenv.append('VK_DRIVER_FILES', _dev_icd.full_path())
+# Deprecated: replaced by VK_DRIVER_FILES above
+devenv.append('VK_ICD_FILENAMES', _dev_icd.full_path())
+
libanv_per_hw_ver_libs = []
anv_per_hw_ver_files = files(
'genX_blorp_exec.c',
'genX_cmd_buffer.c',
+ 'genX_cmd_compute.c',
+ 'genX_cmd_draw.c',
+ 'genX_cmd_draw_generated_flush.h',
+ 'genX_cmd_draw_generated_indirect.h',
+ 'genX_cmd_video.c',
+ 'genX_gfx_state.c',
'genX_gpu_memcpy.c',
+ 'genX_init_state.c',
+ 'genX_internal_kernels.c',
'genX_pipeline.c',
'genX_query.c',
- 'genX_state.c',
+ 'genX_simple_shader.c',
)
-foreach g : [['70', ['gfx7_cmd_buffer.c']], ['75', ['gfx7_cmd_buffer.c']],
- ['80', ['gfx8_cmd_buffer.c']], ['90', ['gfx8_cmd_buffer.c']],
- ['110', ['gfx8_cmd_buffer.c']], ['120', ['gfx8_cmd_buffer.c']],
- ['125', ['gfx8_cmd_buffer.c']]]
- _gfx_ver = g[0]
+if with_intel_vk_rt
+ anv_per_hw_ver_files += files('genX_acceleration_structure.c',)
+endif
+
+foreach _gfx_ver : ['90', '110', '120', '125', '200']
libanv_per_hw_ver_libs += static_library(
'anv_per_hw_ver@0@'.format(_gfx_ver),
- [anv_per_hw_ver_files, g[1], anv_entrypoints[0]],
+ [anv_per_hw_ver_files, anv_entrypoints[0]],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_compiler, inc_intel, inc_vulkan_wsi,
- ],
- c_args : [
- no_override_init_args, c_sse2_args,
- '-DGFX_VERx10=@0@'.format(_gfx_ver),
+ inc_include, inc_src, inc_intel,
],
+ c_args : anv_flags + ['-DGFX_VERx10=@0@'.format(_gfx_ver)],
gnu_symbol_visibility : 'hidden',
dependencies : [
- dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml, idep_vulkan_util_headers,
+ dep_libdrm, dep_valgrind, idep_nir_headers, idep_genxml,
+ idep_vulkan_util_headers, idep_vulkan_wsi_headers,
+ idep_vulkan_runtime_headers, idep_intel_driver_ds_headers,
+ idep_grl, idep_intel_shaders, idep_intel_blorp,
],
)
endforeach
libanv_files = files(
- 'anv_acceleration_structure.c',
+ 'i915/anv_batch_chain.c',
+ 'i915/anv_batch_chain.h',
+ 'i915/anv_device.c',
+ 'i915/anv_device.h',
+ 'i915/anv_gem.c',
+ 'i915/anv_gem.h',
+ 'i915/anv_kmd_backend.c',
+ 'i915/anv_queue.c',
+ 'i915/anv_queue.h',
+ 'layers/anv_android_layer.c',
+ 'layers/anv_doom64.c',
+ 'layers/anv_hitman3.c',
+ 'layers/anv_rmv_layer.c',
+ 'xe/anv_batch_chain.c',
+ 'xe/anv_batch_chain.h',
+ 'xe/anv_kmd_backend.c',
+ 'xe/anv_device.c',
+ 'xe/anv_device.h',
+ 'xe/anv_queue.c',
+ 'xe/anv_queue.h',
'anv_allocator.c',
'anv_android.h',
+ 'anv_astc_emu.c',
'anv_batch_chain.c',
'anv_blorp.c',
+ 'anv_bo_sync.c',
'anv_cmd_buffer.c',
'anv_descriptor_set.c',
'anv_device.c',
'anv_formats.c',
'anv_genX.h',
'anv_image.c',
+ 'anv_internal_kernels.c',
+ 'anv_internal_kernels.h',
+ 'anv_kmd_backend.c',
+ 'anv_kmd_backend.h',
'anv_measure.c',
'anv_measure.h',
+ 'anv_mesh_perprim_wa.c',
'anv_nir.h',
- 'anv_nir_add_base_work_group_id.c',
'anv_nir_apply_pipeline_layout.c',
'anv_nir_compute_push_layout.c',
'anv_nir_lower_multiview.c',
+ 'anv_nir_lower_load_patch_vertices_in.c',
'anv_nir_lower_ubo_loads.c',
- 'anv_nir_lower_ycbcr_textures.c',
- 'anv_pass.c',
+ 'anv_nir_lower_resource_intel.c',
+ 'anv_nir_push_descriptor_analysis.c',
'anv_perf.c',
'anv_pipeline.c',
'anv_pipeline_cache.c',
'anv_private.h',
'anv_queue.c',
+ 'anv_rmv.c',
+ 'anv_rmv.h',
+ 'anv_sparse.c',
'anv_util.c',
+ 'anv_utrace.c',
+ 'anv_va.c',
+ 'anv_video.c',
'anv_wsi.c',
)
@@ -117,77 +206,70 @@ anv_deps = [
idep_genxml,
idep_nir_headers,
idep_vulkan_util_headers,
-]
-anv_flags = [
- no_override_init_args,
- c_sse2_args,
+ idep_vulkan_runtime_headers,
+ idep_vulkan_wsi_headers,
+ idep_intel_shaders,
+ idep_intel_blorp,
]
if with_platform_x11
anv_deps += dep_xcb_dri3
- anv_flags += [
- '-DVK_USE_PLATFORM_XCB_KHR',
- '-DVK_USE_PLATFORM_XLIB_KHR',
- ]
- libanv_files += files('anv_wsi_x11.c')
endif
if with_platform_wayland
anv_deps += dep_wayland_client
- anv_flags += '-DVK_USE_PLATFORM_WAYLAND_KHR'
- libanv_files += files('anv_wsi_wayland.c')
-endif
-
-if system_has_kms_drm and not with_platform_android
- anv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR'
- libanv_files += files('anv_wsi_display.c')
endif
if with_xlib_lease
anv_deps += [dep_xlib_xrandr]
- anv_flags += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT'
endif
if with_platform_android
- anv_flags += '-DVK_USE_PLATFORM_ANDROID_KHR'
+ anv_deps += idep_u_gralloc
libanv_files += files('anv_android.c')
else
libanv_files += files('anv_android_stubs.c')
endif
+anv_deps += idep_intel_driver_ds_headers
+
libanv_common = static_library(
'anv_common',
[
libanv_files, anv_entrypoints, sha1_h,
- gen_xml_pack,
+ gen_xml_pack, intel_float64_spv_h,
],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler,
- inc_vulkan_wsi, inc_util,
+ inc_include, inc_src, inc_intel,
+ inc_util,
],
c_args : anv_flags,
+ cpp_args : anv_cpp_flags,
gnu_symbol_visibility : 'hidden',
- dependencies : anv_deps,
+ dependencies : anv_deps
)
libvulkan_intel = shared_library(
'vulkan_intel',
[files('anv_gem.c'), anv_entrypoints[0]],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, inc_vulkan_wsi,
+ inc_include, inc_src, inc_intel,
],
- link_whole : [libanv_common, libanv_per_hw_ver_libs],
+ link_whole : [libanv_common, libanv_per_hw_ver_libs] + optional_libgrl,
link_with : [
- libintel_compiler, libintel_dev, libisl, libblorp, libvulkan_wsi,
- libintel_perf,
+ libisl, libintel_perf,
],
dependencies : [
dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common,
- idep_nir, idep_genxml, idep_vulkan_util, idep_mesautil, idep_xmlconfig,
+ idep_nir, idep_genxml, idep_vulkan_util, idep_vulkan_wsi,
+ idep_vulkan_runtime, idep_mesautil, idep_xmlconfig,
+ idep_intel_driver_ds, idep_intel_dev, idep_intel_blorp,
+ idep_intel_compiler_brw, idep_intel_decoder_brw,
],
c_args : anv_flags,
gnu_symbol_visibility : 'hidden',
- link_args : [ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections],
+ link_args : [vulkan_icd_link_args, ld_args_build_id, ld_args_bsymbolic, ld_args_gc_sections],
+ link_depends : vulkan_icd_link_depends,
install : true,
)
@@ -209,37 +291,54 @@ if with_tests
'vulkan_intel_test',
[files('anv_gem_stubs.c'), anv_entrypoints[0]],
include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, inc_vulkan_wsi,
+ inc_include, inc_src, inc_intel,
],
link_whole : libanv_common,
link_with : [
- libanv_per_hw_ver_libs, libintel_compiler, libintel_common, libintel_dev,
- libisl, libblorp, libvulkan_wsi, libintel_perf,
- ],
+ libanv_per_hw_ver_libs, libintel_common,
+ libisl, libintel_perf,
+ ] + optional_libgrl,
dependencies : [
dep_thread, dep_dl, dep_m, anv_deps,
- idep_nir, idep_vulkan_util, idep_mesautil,
+ idep_nir, idep_vulkan_util, idep_vulkan_wsi, idep_vulkan_runtime,
+ idep_mesautil, idep_intel_dev, idep_intel_shaders, idep_intel_blorp,
+ idep_intel_compiler_brw, idep_intel_decoder_brw,
],
c_args : anv_flags,
gnu_symbol_visibility : 'hidden',
)
- foreach t : ['block_pool_no_free', 'block_pool_grow_first',
- 'state_pool_no_free', 'state_pool_free_list_only',
- 'state_pool', 'state_pool_padding']
- test(
- 'anv_@0@'.format(t),
- executable(
- t,
- ['tests/@0@.c'.format(t), anv_entrypoints[0]],
- c_args : [ c_sse2_args ],
- link_with : libvulkan_intel_test,
- dependencies : [dep_libdrm, dep_thread, dep_m, dep_valgrind, idep_vulkan_util, ],
- include_directories : [
- inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_intel, inc_compiler, inc_vulkan_wsi,
- ],
- ),
- suite : ['intel'],
- )
- endforeach
+ files_anv_tests = files(
+ 'tests/anv_tests.cpp',
+
+ 'tests/state_pool.c',
+ 'tests/state_pool_free_list_only.c',
+ 'tests/state_pool_max_size.c',
+ 'tests/state_pool_no_free.c',
+ 'tests/state_pool_padding.c',
+ 'tests/block_pool_no_free.c',
+ 'tests/block_pool_grow_first.c',
+ 'tests/block_pool_max_size.c',
+ )
+
+ test(
+ 'anv_tests',
+ executable(
+ 'anv_tests',
+ [files_anv_tests, anv_entrypoints[0]],
+ c_args : [ sse2_args ],
+ link_with : libvulkan_intel_test,
+ dependencies : [
+ idep_gtest, dep_libdrm, dep_thread, dep_m, dep_valgrind,
+ idep_vulkan_util, idep_vulkan_wsi_headers,
+ idep_vulkan_runtime, idep_intel_driver_ds, idep_intel_dev,
+ idep_intel_shaders,
+ ],
+ include_directories : [
+ inc_include, inc_src, inc_intel,
+ ],
+ ),
+ suite : ['intel'],
+ protocol : 'gtest',
+ )
endif
diff --git a/src/intel/vulkan/tests/anv_tests.cpp b/src/intel/vulkan/tests/anv_tests.cpp
new file mode 100644
index 00000000000..09be512f81e
--- /dev/null
+++ b/src/intel/vulkan/tests/anv_tests.cpp
@@ -0,0 +1,25 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <gtest/gtest.h>
+
+#include "test_common.h"
+
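+// Wraps each C test entry point in a gtest TEST() case.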
+#define ANV_C_TEST(S, N, C) extern "C" void C(void); TEST(S, N) { C(); }
+
+ANV_C_TEST(StatePool, Regular, state_pool_test);
+ANV_C_TEST(StatePool, FreeListOnly, state_pool_free_list_only_test);
+ANV_C_TEST(StatePool, MaxSizeOverLimit, state_pool_max_size_over_limit);
+ANV_C_TEST(StatePool, MaxSizeWithinLimit, state_pool_max_size_within_limit);
+ANV_C_TEST(StatePool, NoFree, state_pool_no_free_test);
+ANV_C_TEST(StatePool, Padding, state_pool_padding_test);
+
+ANV_C_TEST(BlockPool, NoFree, block_pool_no_free_test);
+ANV_C_TEST(BlockPool, GrowFirst, block_pool_grow_first_test);
+ANV_C_TEST(BlockPool, MaxSize, block_pool_max_size);
+
+extern "C" void FAIL_IN_GTEST(const char *file_path, unsigned line_number, const char *msg) {
+ GTEST_FAIL_AT(file_path, line_number) << msg;
+}
diff --git a/src/intel/vulkan/tests/block_pool_grow_first.c b/src/intel/vulkan/tests/block_pool_grow_first.c
index e50f65c8d68..1c745360ea8 100644
--- a/src/intel/vulkan/tests/block_pool_grow_first.c
+++ b/src/intel/vulkan/tests/block_pool_grow_first.c
@@ -24,14 +24,12 @@
#include "anv_private.h"
#include "test_common.h"
-int main(void)
+void block_pool_grow_first_test(void);
+
+void block_pool_grow_first_test(void)
{
- struct anv_physical_device physical_device = {
- .use_softpin = true,
- };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_physical_device physical_device = {};
+ struct anv_device device = {};
struct anv_block_pool pool;
/* Create a pool with initial size smaller than the block allocated, so
@@ -39,14 +37,20 @@ int main(void)
*/
const uint32_t block_size = 16 * 1024;
const uint32_t initial_size = block_size / 2;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_block_pool_init(&pool, &device, "test", 4096, initial_size);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_block_pool_init(&pool, &device, "test", 4096, initial_size, _1Gb);
ASSERT(pool.size == initial_size);
uint32_t padding;
- int32_t offset = anv_block_pool_alloc(&pool, block_size, &padding);
+ int64_t offset;
+ VkResult result = anv_block_pool_alloc(&pool, block_size, &offset, &padding);
+ ASSERT(result == VK_SUCCESS);
/* Pool will have grown at least space to fit the new allocation. */
ASSERT(pool.size > initial_size);
@@ -63,4 +67,6 @@ int main(void)
memset(map, 22, block_size);
anv_block_pool_finish(&pool);
+ anv_bo_cache_finish(&device.bo_cache);
+ pthread_mutex_destroy(&device.mutex);
}
diff --git a/src/intel/vulkan/tests/block_pool_max_size.c b/src/intel/vulkan/tests/block_pool_max_size.c
new file mode 100644
index 00000000000..b9f6620cbaf
--- /dev/null
+++ b/src/intel/vulkan/tests/block_pool_max_size.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "test_common.h"
+
+void block_pool_max_size(void);
+
+void block_pool_max_size(void)
+{
+ struct anv_physical_device physical_device = {};
+ struct anv_device device = {};
+ struct anv_block_pool pool;
+
+ const uint32_t block_size = 16 * 1024;
+ const uint32_t initial_size = block_size;
+ const uint32_t _1Mb = 1024 * 1024;
+
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
+ pthread_mutex_init(&device.mutex, NULL);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_block_pool_init(&pool, &device, "test", 4096, initial_size, _1Mb);
+ ASSERT(pool.size == initial_size);
+
+ for (uint32_t i = 0; i < _1Mb / block_size; i++) {
+ uint32_t padding;
+ int64_t offset;
+
+ VkResult result = anv_block_pool_alloc(&pool, block_size, &offset, &padding);
+ ASSERT(result == VK_SUCCESS);
+
+ /* The pool must not grow past its configured maximum size. */
+ ASSERT(pool.size <= _1Mb);
+
+ /* Use the memory to ensure it is valid. */
+ void *map = anv_block_pool_map(&pool, offset, block_size);
+ memset(map, 22, block_size);
+ }
+
+ {
+ uint32_t padding;
+ int64_t offset;
+
+ VkResult result = anv_block_pool_alloc(&pool, block_size, &offset, &padding);
+ ASSERT(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
+ }
+
+ anv_block_pool_finish(&pool);
+ anv_bo_cache_finish(&device.bo_cache);
+ pthread_mutex_destroy(&device.mutex);
+}
diff --git a/src/intel/vulkan/tests/block_pool_no_free.c b/src/intel/vulkan/tests/block_pool_no_free.c
index 37030bdd7a3..7c9c8951361 100644
--- a/src/intel/vulkan/tests/block_pool_no_free.c
+++ b/src/intel/vulkan/tests/block_pool_no_free.c
@@ -30,12 +30,11 @@
#define BLOCKS_PER_THREAD 1024
#define NUM_RUNS 64
-struct job {
+static struct job {
pthread_t thread;
unsigned id;
struct anv_block_pool *pool;
int32_t blocks[BLOCKS_PER_THREAD];
- int32_t back_blocks[BLOCKS_PER_THREAD];
} jobs[NUM_THREADS];
@@ -44,30 +43,24 @@ static void *alloc_blocks(void *_job)
struct job *job = _job;
uint32_t job_id = job - jobs;
uint32_t block_size = 16 * ((job_id % 4) + 1);
- int32_t block, *data;
+ int64_t block;
+ int32_t *data;
for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) {
- block = anv_block_pool_alloc(job->pool, block_size, NULL);
+ UNUSED uint32_t padding;
+ VkResult result = anv_block_pool_alloc(job->pool, block_size,
+ &block, &padding);
+ ASSERT(result == VK_SUCCESS);
data = anv_block_pool_map(job->pool, block, block_size);
*data = block;
ASSERT(block >= 0);
job->blocks[i] = block;
-
- block = anv_block_pool_alloc_back(job->pool, block_size);
- data = anv_block_pool_map(job->pool, block, block_size);
- *data = block;
- ASSERT(block < 0);
- job->back_blocks[i] = -block;
}
for (unsigned i = 0; i < BLOCKS_PER_THREAD; i++) {
block = job->blocks[i];
data = anv_block_pool_map(job->pool, block, block_size);
ASSERT(*data == block);
-
- block = -job->back_blocks[i];
- data = anv_block_pool_map(job->pool, block, block_size);
- ASSERT(*data == block);
}
return NULL;
@@ -110,15 +103,17 @@ static void validate_monotonic(int32_t **blocks)
static void run_test()
{
- struct anv_physical_device physical_device = { };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_physical_device physical_device = {};
+ struct anv_device device = {};
struct anv_block_pool pool;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_block_pool_init(&pool, &device, "test", 4096, 4096);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_block_pool_init(&pool, &device, "test", 4096, 4096, _1Gb);
for (unsigned i = 0; i < NUM_THREADS; i++) {
jobs[i].pool = &pool;
@@ -135,16 +130,14 @@ static void run_test()
block_ptrs[i] = jobs[i].blocks;
validate_monotonic(block_ptrs);
- /* Validate that the back block allocations were monotonic */
- for (unsigned i = 0; i < NUM_THREADS; i++)
- block_ptrs[i] = jobs[i].back_blocks;
- validate_monotonic(block_ptrs);
-
anv_block_pool_finish(&pool);
+ anv_bo_cache_finish(&device.bo_cache);
pthread_mutex_destroy(&device.mutex);
}
-int main(void)
+void block_pool_no_free_test(void);
+
+void block_pool_no_free_test(void)
{
for (unsigned i = 0; i < NUM_RUNS; i++)
run_test();
diff --git a/src/intel/vulkan/tests/state_pool.c b/src/intel/vulkan/tests/state_pool.c
index 2f54efe783c..20eb2a34750 100644
--- a/src/intel/vulkan/tests/state_pool.c
+++ b/src/intel/vulkan/tests/state_pool.c
@@ -26,34 +26,45 @@
#include "anv_private.h"
#include "test_common.h"
-#define NUM_THREADS 8
-#define STATES_PER_THREAD_LOG2 10
-#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2)
-#define NUM_RUNS 64
-
#include "state_pool_test_helper.h"
-int main(void)
+void state_pool_test(void);
+
+void state_pool_test(void)
{
+ const unsigned num_threads = 8;
+ const unsigned states_per_thread = 1 << 10;
+
struct anv_physical_device physical_device = { };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_device device = {};
struct anv_state_pool state_pool;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
+ anv_bo_cache_init(&device.bo_cache, &device);
- for (unsigned i = 0; i < NUM_RUNS; i++) {
- anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 256);
+ const unsigned num_runs = 64;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ for (unsigned i = 0; i < num_runs; i++) {
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = 256,
+ .max_size = _1Gb,
+ });
/* Grab one so a zero offset is impossible */
anv_state_pool_alloc(&state_pool, 16, 16);
- run_state_pool_test(&state_pool);
+ run_state_pool_test(&state_pool, num_threads, states_per_thread);
anv_state_pool_finish(&state_pool);
}
+ anv_bo_cache_finish(&device.bo_cache);
pthread_mutex_destroy(&device.mutex);
}
diff --git a/src/intel/vulkan/tests/state_pool_free_list_only.c b/src/intel/vulkan/tests/state_pool_free_list_only.c
index 193169867c1..d64a8b8f827 100644
--- a/src/intel/vulkan/tests/state_pool_free_list_only.c
+++ b/src/intel/vulkan/tests/state_pool_free_list_only.c
@@ -26,23 +26,33 @@
#include "anv_private.h"
#include "test_common.h"
-#define NUM_THREADS 8
-#define STATES_PER_THREAD_LOG2 12
-#define STATES_PER_THREAD (1 << STATES_PER_THREAD_LOG2)
-
#include "state_pool_test_helper.h"
-int main(void)
+void state_pool_free_list_only_test(void);
+
+void state_pool_free_list_only_test(void)
{
+ const unsigned num_threads = 8;
+ const unsigned states_per_thread = 1 << 12;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+
struct anv_physical_device physical_device = { };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_device device = {};
struct anv_state_pool state_pool;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = 4096,
+ .max_size = _1Gb,
+ });
/* Grab one so a zero offset is impossible */
anv_state_pool_alloc(&state_pool, 16, 16);
@@ -51,18 +61,19 @@ int main(void)
* actually ever resize anything.
*/
{
- struct anv_state states[NUM_THREADS * STATES_PER_THREAD];
- for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++) {
+ struct anv_state states[num_threads * states_per_thread];
+ for (unsigned i = 0; i < ARRAY_SIZE(states); i++) {
states[i] = anv_state_pool_alloc(&state_pool, 16, 16);
ASSERT(states[i].offset != 0);
}
- for (unsigned i = 0; i < NUM_THREADS * STATES_PER_THREAD; i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(states); i++)
anv_state_pool_free(&state_pool, states[i]);
}
- run_state_pool_test(&state_pool);
+ run_state_pool_test(&state_pool, num_threads, states_per_thread);
anv_state_pool_finish(&state_pool);
+ anv_bo_cache_finish(&device.bo_cache);
pthread_mutex_destroy(&device.mutex);
}
diff --git a/src/intel/vulkan/tests/state_pool_max_size.c b/src/intel/vulkan/tests/state_pool_max_size.c
new file mode 100644
index 00000000000..4b7cb962b4e
--- /dev/null
+++ b/src/intel/vulkan/tests/state_pool_max_size.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+
+#include "anv_private.h"
+#include "test_common.h"
+
+#define NUM_THREADS 16
+#define STATES_PER_THREAD 1024
+#define NUM_RUNS 1
+
+static struct job {
+ pthread_t thread;
+ uint32_t state_size;
+ uint32_t state_alignment;
+ struct anv_state_pool *pool;
+ struct anv_state states[STATES_PER_THREAD];
+} jobs[NUM_THREADS];
+
+static pthread_barrier_t barrier;
+
+static void *alloc_states(void *_job)
+{
+ struct job *job = _job;
+
+ pthread_barrier_wait(&barrier);
+
+ for (unsigned i = 0; i < STATES_PER_THREAD; i++) {
+ struct anv_state state = anv_state_pool_alloc(job->pool,
+ job->state_size,
+ job->state_alignment);
+ job->states[i] = state;
+ }
+
+ return NULL;
+}
+
+static void run_test(uint32_t state_size,
+ uint32_t state_alignment,
+ uint32_t block_size,
+ uint32_t pool_max_size)
+{
+ struct anv_physical_device physical_device = { };
+ struct anv_device device = {};
+ struct anv_state_pool state_pool;
+
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
+ pthread_mutex_init(&device.mutex, NULL);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = block_size,
+ .max_size = pool_max_size,
+ });
+
+ pthread_barrier_init(&barrier, NULL, NUM_THREADS);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(jobs); i++) {
+ jobs[i].state_size = state_size;
+ jobs[i].state_alignment = state_alignment;
+ jobs[i].pool = &state_pool;
+ pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]);
+ }
+
+ for (unsigned i = 0; i < ARRAY_SIZE(jobs); i++)
+ pthread_join(jobs[i].thread, NULL);
+
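+ /* The check below treats each state as consuming one block: the pool holds
+ * at most pool_max_size / block_size blocks and any requests beyond that
+ * must fail. In the over-limit run this is (16 * 16 * 1024) / 64 = 4096
+ * blocks for 16 * 1024 = 16384 requests, i.e. 12288 expected failures.
+ */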
+ const uint32_t expected_allocation_fails =
+ (NUM_THREADS * STATES_PER_THREAD * block_size) > pool_max_size ?
+ ((NUM_THREADS * STATES_PER_THREAD) - (pool_max_size / block_size)) : 0;
+ uint32_t allocation_fails = 0;
+ for (unsigned j = 0; j < ARRAY_SIZE(jobs); j++) {
+ int64_t last_state_offset = -1;
+ for (unsigned s = 0; s < ARRAY_SIZE(jobs[j].states); s++) {
+ if (jobs[j].states[s].alloc_size) {
+ ASSERT(last_state_offset < jobs[j].states[s].offset);
+ last_state_offset = jobs[j].states[s].offset;
+ } else {
+ allocation_fails++;
+ }
+ }
+ }
+
+ ASSERT(allocation_fails == expected_allocation_fails);
+
+ anv_state_pool_finish(&state_pool);
+ anv_bo_cache_finish(&device.bo_cache);
+ pthread_mutex_destroy(&device.mutex);
+}
+
+void state_pool_max_size_within_limit(void);
+
+void state_pool_max_size_within_limit(void)
+{
+ for (unsigned i = 0; i < NUM_RUNS; i++)
+ run_test(16, 16, 64, 64 * NUM_THREADS * STATES_PER_THREAD);
+}
+
+void state_pool_max_size_over_limit(void);
+
+void state_pool_max_size_over_limit(void)
+{
+ for (unsigned i = 0; i < NUM_RUNS; i++)
+ run_test(16, 16, 64, 16 * NUM_THREADS * STATES_PER_THREAD);
+}
diff --git a/src/intel/vulkan/tests/state_pool_no_free.c b/src/intel/vulkan/tests/state_pool_no_free.c
index 4288e1a1b87..07df9b1847c 100644
--- a/src/intel/vulkan/tests/state_pool_no_free.c
+++ b/src/intel/vulkan/tests/state_pool_no_free.c
@@ -30,14 +30,14 @@
#define STATES_PER_THREAD 1024
#define NUM_RUNS 64
-struct job {
+static struct job {
pthread_t thread;
unsigned id;
struct anv_state_pool *pool;
uint32_t offsets[STATES_PER_THREAD];
} jobs[NUM_THREADS];
-pthread_barrier_t barrier;
+static pthread_barrier_t barrier;
static void *alloc_states(void *_job)
{
@@ -56,14 +56,23 @@ static void *alloc_states(void *_job)
static void run_test()
{
struct anv_physical_device physical_device = { };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_device device = {};
struct anv_state_pool state_pool;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 64);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = 64,
+ .max_size = _1Gb,
+ });
pthread_barrier_init(&barrier, NULL, NUM_THREADS);
@@ -109,10 +118,13 @@ static void run_test()
}
anv_state_pool_finish(&state_pool);
+ anv_bo_cache_finish(&device.bo_cache);
pthread_mutex_destroy(&device.mutex);
}
-int main(void)
+void state_pool_no_free_test(void);
+
+void state_pool_no_free_test(void)
{
for (unsigned i = 0; i < NUM_RUNS; i++)
run_test();
diff --git a/src/intel/vulkan/tests/state_pool_padding.c b/src/intel/vulkan/tests/state_pool_padding.c
index 70fb773b5b1..b9fa15f11a3 100644
--- a/src/intel/vulkan/tests/state_pool_padding.c
+++ b/src/intel/vulkan/tests/state_pool_padding.c
@@ -24,19 +24,28 @@
#include "anv_private.h"
#include "test_common.h"
-int main(void)
+void state_pool_padding_test(void);
+
+void state_pool_padding_test(void)
{
- struct anv_physical_device physical_device = {
- .use_softpin = true,
- };
- struct anv_device device = {
- .physical = &physical_device,
- };
+ struct anv_physical_device physical_device = {};
+ struct anv_device device = {};
struct anv_state_pool state_pool;
+ const uint32_t _1Gb = 1024 * 1024 * 1024;
+ test_device_info_init(&physical_device.info);
+ anv_device_set_physical(&device, &physical_device);
+ device.kmd_backend = anv_kmd_backend_get(INTEL_KMD_TYPE_STUB);
pthread_mutex_init(&device.mutex, NULL);
- anv_bo_cache_init(&device.bo_cache);
- anv_state_pool_init(&state_pool, &device, "test", 4096, 0, 4096);
+ anv_bo_cache_init(&device.bo_cache, &device);
+ anv_state_pool_init(&state_pool, &device,
+ &(struct anv_state_pool_params) {
+ .name = "test",
+ .base_address = 4096,
+ .start_offset = 0,
+ .block_size = 4096,
+ .max_size = _1Gb,
+ });
/* Get the size of the underlying block_pool */
struct anv_block_pool *bp = &state_pool.block_pool;
@@ -75,4 +84,6 @@ int main(void)
ASSERT(state.offset == pool_size);
anv_state_pool_finish(&state_pool);
+ anv_bo_cache_finish(&device.bo_cache);
+ pthread_mutex_destroy(&device.mutex);
}
diff --git a/src/intel/vulkan/tests/state_pool_test_helper.h b/src/intel/vulkan/tests/state_pool_test_helper.h
index f22a28ecc6f..de6a363efe1 100644
--- a/src/intel/vulkan/tests/state_pool_test_helper.h
+++ b/src/intel/vulkan/tests/state_pool_test_helper.h
@@ -23,49 +23,70 @@
#include <pthread.h>
+#include "util/u_math.h"
+
struct job {
- struct anv_state_pool *pool;
+ struct state_pool_test_context *ctx;
unsigned id;
pthread_t thread;
-} jobs[NUM_THREADS];
+};
+
+struct state_pool_test_context {
+ struct anv_state_pool *pool;
+ unsigned states_per_thread;
+ pthread_barrier_t barrier;
-pthread_barrier_t barrier;
+ struct job *jobs;
+};
static void *alloc_states(void *void_job)
{
struct job *job = void_job;
+ struct state_pool_test_context *ctx = job->ctx;
- const unsigned chunk_size = 1 << (job->id % STATES_PER_THREAD_LOG2);
- const unsigned num_chunks = STATES_PER_THREAD / chunk_size;
+ const unsigned states_per_thread_log2 = util_logbase2(ctx->states_per_thread);
+ const unsigned chunk_size = 1 << (job->id % states_per_thread_log2);
+ const unsigned num_chunks = ctx->states_per_thread / chunk_size;
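+ /* Each thread allocates and frees its states in power-of-two sized chunks
+ * derived from its id, e.g. with states_per_thread = 1024 (log2 = 10) and
+ * id = 3, chunk_size = 8 and num_chunks = 128.
+ */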
struct anv_state states[chunk_size];
- pthread_barrier_wait(&barrier);
+ pthread_barrier_wait(&ctx->barrier);
for (unsigned c = 0; c < num_chunks; c++) {
for (unsigned i = 0; i < chunk_size; i++) {
- states[i] = anv_state_pool_alloc(job->pool, 16, 16);
+ states[i] = anv_state_pool_alloc(ctx->pool, 16, 16);
memset(states[i].map, 139, 16);
ASSERT(states[i].offset != 0);
}
for (unsigned i = 0; i < chunk_size; i++)
- anv_state_pool_free(job->pool, states[i]);
+ anv_state_pool_free(ctx->pool, states[i]);
}
return NULL;
}
-static void run_state_pool_test(struct anv_state_pool *state_pool)
+static void run_state_pool_test(struct anv_state_pool *state_pool, unsigned num_threads,
+ unsigned states_per_thread)
{
- pthread_barrier_init(&barrier, NULL, NUM_THREADS);
+ struct state_pool_test_context ctx = {
+ .pool = state_pool,
+ .states_per_thread = states_per_thread,
+ .jobs = calloc(num_threads, sizeof(struct job)),
+ };
+ pthread_barrier_init(&ctx.barrier, NULL, num_threads);
+
+ for (unsigned i = 0; i < num_threads; i++) {
+ struct job *job = &ctx.jobs[i];
+ job->ctx = &ctx;
+ job->id = i;
+ pthread_create(&job->thread, NULL, alloc_states, job);
+ }
- for (unsigned i = 0; i < NUM_THREADS; i++) {
- jobs[i].pool = state_pool;
- jobs[i].id = i;
- pthread_create(&jobs[i].thread, NULL, alloc_states, &jobs[i]);
+ for (unsigned i = 0; i < num_threads; i++) {
+ struct job *job = &ctx.jobs[i];
+ pthread_join(job->thread, NULL);
}
- for (unsigned i = 0; i < NUM_THREADS; i++)
- pthread_join(jobs[i].thread, NULL);
+ free(ctx.jobs);
}
diff --git a/src/intel/vulkan/tests/test_common.h b/src/intel/vulkan/tests/test_common.h
index 3f883e3bdcd..eea5b5ac82f 100644
--- a/src/intel/vulkan/tests/test_common.h
+++ b/src/intel/vulkan/tests/test_common.h
@@ -21,14 +21,27 @@
* IN THE SOFTWARE.
*/
-#include <stdio.h>
-#include <stdlib.h>
+#include "dev/intel_device_info.h"
-#define ASSERT(cond) \
- do { \
- if (!(cond)) { \
- fprintf(stderr, "%s:%d: Test assertion `%s` failed.\n", \
- __FILE__, __LINE__, # cond); \
- abort(); \
- } \
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ASSERT(cond) \
+ do { \
+ if (!(cond)) { \
+ FAIL_IN_GTEST(__FILE__, __LINE__, "Test assertion `" # cond \
+ "` failed."); \
+ } \
} while (false)
+
+static inline void test_device_info_init(struct intel_device_info *info)
+{
+ info->mem_alignment = 4096;
+}
+
+void FAIL_IN_GTEST(const char *file_path, unsigned line_number, const char *msg);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/src/intel/vulkan/xe/anv_batch_chain.c b/src/intel/vulkan/xe/anv_batch_chain.c
new file mode 100644
index 00000000000..69a5ed69949
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_batch_chain.c
@@ -0,0 +1,409 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "xe/anv_batch_chain.h"
+
+#include "anv_private.h"
+#include "anv_measure.h"
+#include "common/intel_bind_timeline.h"
+
+#include "drm-uapi/xe_drm.h"
+
+VkResult
+xe_execute_simple_batch(struct anv_queue *queue,
+ struct anv_bo *batch_bo,
+ uint32_t batch_bo_size,
+ bool is_companion_rcs_batch)
+{
+ struct anv_device *device = queue->device;
+ uint32_t exec_queue_id = is_companion_rcs_batch ?
+ queue->companion_rcs_id :
+ queue->exec_queue_id;
+ struct drm_syncobj_create syncobj_create = {};
+ struct drm_syncobj_destroy syncobj_destroy = {};
+ struct drm_xe_sync syncs[2] = {};
+ VkResult result = VK_SUCCESS;
+
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &syncobj_create))
+ return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create sync obj");
+
+ syncs[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
+ syncs[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
+ syncs[0].handle = syncobj_create.handle;
+
+ /* vm bind sync */
+ syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
+ syncs[1].handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
+ syncs[1].timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);
+
+ struct drm_xe_exec exec = {
+ .exec_queue_id = exec_queue_id,
+ .num_batch_buffer = 1,
+ .address = batch_bo->offset,
+ .num_syncs = ARRAY_SIZE(syncs),
+ .syncs = (uintptr_t)syncs,
+ };
+
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
+ result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
+ goto exec_error;
+ }
+
+ struct drm_syncobj_wait wait = {
+ .handles = (uintptr_t)&syncobj_create.handle,
+ .timeout_nsec = INT64_MAX,
+ .count_handles = 1,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait))
+ result = vk_device_set_lost(&device->vk, "DRM_IOCTL_SYNCOBJ_WAIT failed: %m");
+
+exec_error:
+ syncobj_destroy.handle = syncobj_create.handle;
+ intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &syncobj_destroy);
+
+ return result;
+}
+
+#define TYPE_SIGNAL true
+#define TYPE_WAIT false
+
+struct drm_xe_sync
+vk_sync_to_drm_xe_sync(struct vk_sync *vk_sync, uint64_t value, bool signal)
+{
+ const struct vk_drm_syncobj *syncobj = vk_sync_as_drm_syncobj(vk_sync);
+ assert(syncobj);
+
+ struct drm_xe_sync drm_sync = {
+ .type = value ? DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ :
+ DRM_XE_SYNC_TYPE_SYNCOBJ,
+ .flags = signal ? DRM_XE_SYNC_FLAG_SIGNAL : 0,
+ .handle = syncobj->syncobj,
+ .timeline_value = value,
+ };
+
+ return drm_sync;
+}
+
+static VkResult
+xe_exec_process_syncs(struct anv_queue *queue,
+ uint32_t wait_count, const struct vk_sync_wait *waits,
+ uint32_t signal_count, const struct vk_sync_signal *signals,
+ uint32_t extra_sync_count, const struct drm_xe_sync *extra_syncs,
+ struct anv_utrace_submit *utrace_submit,
+ bool is_companion_rcs_queue,
+ struct drm_xe_sync **ret, uint32_t *ret_count)
+{
+ struct anv_device *device = queue->device;
+ /* Signal the utrace sync only if it doesn't have a batch. Otherwise it's
+ * the utrace batch itself that should signal its own sync.
+ */
+ const bool has_utrace_sync = utrace_submit &&
+ util_dynarray_num_elements(&utrace_submit->batch_bos, struct anv_bo *) == 0;
+ const uint32_t num_syncs = wait_count + signal_count + extra_sync_count +
+ (has_utrace_sync ? 1 : 0) +
+ ((queue->sync && !is_companion_rcs_queue) ? 1 : 0) +
+ 1 /* vm bind sync */;
+ struct drm_xe_sync *xe_syncs = vk_zalloc(&device->vk.alloc,
+ sizeof(*xe_syncs) * num_syncs, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!xe_syncs)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ uint32_t count = 0;
+
+ if (has_utrace_sync) {
+ xe_syncs[count++] = vk_sync_to_drm_xe_sync(utrace_submit->sync, 0,
+ TYPE_SIGNAL);
+ }
+
+ for (uint32_t i = 0; i < wait_count; i++) {
+ xe_syncs[count++] = vk_sync_to_drm_xe_sync(waits[i].sync,
+ waits[i].wait_value,
+ TYPE_WAIT);
+ }
+
+ for (uint32_t i = 0; i < signal_count; i++) {
+ xe_syncs[count++] = vk_sync_to_drm_xe_sync(signals[i].sync,
+ signals[i].signal_value,
+ TYPE_SIGNAL);
+ }
+
+ for (uint32_t i = 0; i < extra_sync_count; i++)
+ xe_syncs[count++] = extra_syncs[i];
+
+ if (queue->sync && !is_companion_rcs_queue)
+ xe_syncs[count++] = vk_sync_to_drm_xe_sync(queue->sync, 0, TYPE_SIGNAL);
+
+ /* vm bind sync */
+ xe_syncs[count++] = (struct drm_xe_sync) {
+ .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
+ .flags = 0 /* TYPE_WAIT */,
+ .handle = intel_bind_timeline_get_syncobj(&device->bind_timeline),
+ .timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline),
+ };
+
+ assert(count == num_syncs);
+ *ret = xe_syncs;
+ *ret_count = num_syncs;
+ return VK_SUCCESS;
+}
+
+static void
+xe_exec_print_debug(struct anv_queue *queue, uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers, struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass, struct drm_xe_exec *exec)
+{
+ if (INTEL_DEBUG(DEBUG_SUBMIT))
+ fprintf(stderr, "Batch offset=0x%016"PRIx64" on queue %u\n",
+ (uint64_t)exec->address, queue->vk.index_in_family);
+
+ anv_cmd_buffer_exec_batch_debug(queue, cmd_buffer_count, cmd_buffers,
+ perf_query_pool, perf_query_pass);
+}
+
+VkResult
+xe_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo)
+{
+ struct anv_queue *queue = submit->queue;
+ struct anv_device *device = queue->device;
+ struct anv_trtt *trtt = &device->trtt;
+ VkResult result = VK_SUCCESS;
+
+ struct drm_xe_sync extra_sync = {
+ .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
+ .flags = DRM_XE_SYNC_FLAG_SIGNAL,
+ .handle = trtt->timeline_handle,
+ .timeline_value = trtt_bbo->timeline_val,
+ };
+
+ struct drm_xe_sync *xe_syncs = NULL;
+ uint32_t xe_syncs_count = 0;
+ result = xe_exec_process_syncs(queue, submit->wait_count, submit->waits,
+ submit->signal_count, submit->signals,
+ 1, &extra_sync,
+ NULL, /* utrace_submit */
+ false, /* is_companion_rcs_queue */
+ &xe_syncs, &xe_syncs_count);
+ if (result != VK_SUCCESS)
+ return result;
+
+ struct drm_xe_exec exec = {
+ .exec_queue_id = queue->exec_queue_id,
+ .num_syncs = xe_syncs_count,
+ .syncs = (uintptr_t)xe_syncs,
+ .address = trtt_bbo->bo->offset,
+ .num_batch_buffer = 1,
+ };
+
+ if (!device->info->no_hw) {
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
+ result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
+ goto out;
+ }
+ }
+
+ if (queue->sync) {
+ result = vk_sync_wait(&device->vk, queue->sync, 0,
+ VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+ }
+
+out:
+ vk_free(&device->vk.alloc, xe_syncs);
+ return result;
+}
+
+VkResult
+xe_queue_exec_utrace_locked(struct anv_queue *queue,
+ struct anv_utrace_submit *utrace_submit)
+{
+ struct anv_device *device = queue->device;
+ struct drm_xe_sync xe_syncs[2] = {};
+
+ xe_syncs[0] = vk_sync_to_drm_xe_sync(utrace_submit->sync, 0, TYPE_SIGNAL);
+
+ xe_syncs[1].type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ;
+ xe_syncs[1].handle = intel_bind_timeline_get_syncobj(&device->bind_timeline);
+ xe_syncs[1].timeline_value = intel_bind_timeline_get_last_point(&device->bind_timeline);
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(device->utrace_bo_pool.bo_alloc_flags)) {
+ util_dynarray_foreach(&utrace_submit->batch_bos, struct anv_bo *, bo)
+ intel_flush_range((*bo)->map, (*bo)->size);
+ }
+#endif
+
+ struct anv_bo *batch_bo =
+ *util_dynarray_element(&utrace_submit->batch_bos, struct anv_bo *, 0);
+ struct drm_xe_exec exec = {
+ .exec_queue_id = queue->exec_queue_id,
+ .num_batch_buffer = 1,
+ .syncs = (uintptr_t)xe_syncs,
+ .num_syncs = ARRAY_SIZE(xe_syncs),
+ .address = batch_bo->offset,
+ };
+ if (likely(!device->info->no_hw)) {
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
+ return vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkResult
+xe_companion_rcs_queue_exec_locked(struct anv_queue *queue,
+ struct anv_cmd_buffer *companion_rcs_cmd_buffer,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits)
+{
+ struct anv_device *device = queue->device;
+ VkResult result;
+
+ struct vk_sync_signal companion_sync = {
+ .sync = queue->companion_sync,
+ };
+ struct drm_xe_sync *xe_syncs = NULL;
+ uint32_t xe_syncs_count = 0;
+ result = xe_exec_process_syncs(queue,
+ wait_count, waits,
+ 1, &companion_sync,
+ 0, NULL, /* extra_syncs */
+ NULL /* utrace_submit */,
+ true /* is_companion_rcs_queue */,
+ &xe_syncs,
+ &xe_syncs_count);
+ if (result != VK_SUCCESS)
+ return result;
+
+ struct drm_xe_exec exec = {
+ .exec_queue_id = queue->companion_rcs_id,
+ .num_batch_buffer = 1,
+ .syncs = (uintptr_t)xe_syncs,
+ .num_syncs = xe_syncs_count,
+ };
+
+ struct anv_batch_bo *batch_bo =
+ list_first_entry(&companion_rcs_cmd_buffer->batch_bos,
+ struct anv_batch_bo, link);
+ exec.address = batch_bo->bo->offset;
+
+ anv_measure_submit(companion_rcs_cmd_buffer);
+ xe_exec_print_debug(queue, 1, &companion_rcs_cmd_buffer, NULL, 0, &exec);
+
+ if (!device->info->no_hw) {
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
+ result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
+ }
+ vk_free(&device->vk.alloc, xe_syncs);
+
+ return result;
+}
+
+VkResult
+xe_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit)
+{
+ struct anv_device *device = queue->device;
+ VkResult result;
+
+ struct drm_xe_sync *xe_syncs = NULL;
+ uint32_t xe_syncs_count = 0;
+ result = xe_exec_process_syncs(queue, wait_count, waits,
+ signal_count, signals,
+ 0, NULL, /* extra_syncs */
+ utrace_submit,
+ false, /* is_companion_rcs_queue */
+ &xe_syncs, &xe_syncs_count);
+ if (result != VK_SUCCESS)
+ return result;
+
+ /* If we have no batch for utrace, just forget about it now. */
+ if (utrace_submit &&
+ util_dynarray_num_elements(&utrace_submit->batch_bos,
+ struct anv_bo *) == 0)
+ utrace_submit = NULL;
+
+ struct drm_xe_exec exec = {
+ .exec_queue_id = queue->exec_queue_id,
+ .num_batch_buffer = 1,
+ .syncs = (uintptr_t)xe_syncs,
+ .num_syncs = xe_syncs_count,
+ };
+
+ if (cmd_buffer_count) {
+ if (unlikely(device->physical->measure_device.config)) {
+ for (uint32_t i = 0; i < cmd_buffer_count; i++)
+ anv_measure_submit(cmd_buffers[i]);
+ }
+
+ anv_cmd_buffer_chain_command_buffers(cmd_buffers, cmd_buffer_count);
+
+#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
+ if (device->physical->memory.need_flush &&
+ anv_bo_needs_host_cache_flush(device->batch_bo_pool.bo_alloc_flags))
+ anv_cmd_buffer_clflush(cmd_buffers, cmd_buffer_count);
+#endif
+
+ struct anv_cmd_buffer *first_cmd_buffer = cmd_buffers[0];
+ struct anv_batch_bo *first_batch_bo = list_first_entry(&first_cmd_buffer->batch_bos,
+ struct anv_batch_bo, link);
+ exec.address = first_batch_bo->bo->offset;
+ } else {
+ exec.address = device->trivial_batch_bo->offset;
+ }
+
+ xe_exec_print_debug(queue, cmd_buffer_count, cmd_buffers, perf_query_pool,
+ perf_query_pass, &exec);
+
+ /* TODO: add perfetto stuff when Xe supports it */
+
+ if (!device->info->no_hw) {
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
+ result = vk_device_set_lost(&device->vk, "anv_xe_queue_exec_locked failed: %m");
+ }
+ vk_free(&device->vk.alloc, xe_syncs);
+
+ if (cmd_buffer_count != 0 && cmd_buffers[0]->companion_rcs_cmd_buffer) {
+ /* not allowed to chain cmd_buffers with companion_rcs_cmd_buffer */
+ assert(cmd_buffer_count == 1);
+ result = xe_companion_rcs_queue_exec_locked(queue,
+ cmd_buffers[0]->companion_rcs_cmd_buffer,
+ wait_count, waits);
+ }
+
+ result = anv_queue_post_submit(queue, result);
+
+ if (result == VK_SUCCESS && utrace_submit)
+ result = xe_queue_exec_utrace_locked(queue, utrace_submit);
+
+ return result;
+}
diff --git a/src/intel/vulkan/xe/anv_batch_chain.h b/src/intel/vulkan/xe/anv_batch_chain.h
new file mode 100644
index 00000000000..9afd8f06b6a
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_batch_chain.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "drm-uapi/xe_drm.h"
+#include "vulkan/vulkan_core.h"
+#include "vk_sync.h"
+
+struct anv_device;
+struct anv_queue;
+struct anv_bo;
+struct anv_cmd_buffer;
+struct anv_query_pool;
+struct anv_utrace_submit;
+struct anv_sparse_submission;
+struct anv_trtt_batch_bo;
+
+VkResult
+xe_execute_simple_batch(struct anv_queue *queue, struct anv_bo *batch_bo,
+ uint32_t batch_bo_size, bool is_companion_rcs_batch);
+VkResult
+xe_execute_trtt_batch(struct anv_sparse_submission *submit,
+ struct anv_trtt_batch_bo *trtt_bbo);
+
+VkResult
+xe_queue_exec_locked(struct anv_queue *queue,
+ uint32_t wait_count,
+ const struct vk_sync_wait *waits,
+ uint32_t cmd_buffer_count,
+ struct anv_cmd_buffer **cmd_buffers,
+ uint32_t signal_count,
+ const struct vk_sync_signal *signals,
+ struct anv_query_pool *perf_query_pool,
+ uint32_t perf_query_pass,
+ struct anv_utrace_submit *utrace_submit);
+
+VkResult
+xe_queue_exec_utrace_locked(struct anv_queue *queue,
+ struct anv_utrace_submit *utrace_submit);
+
+struct drm_xe_sync
+vk_sync_to_drm_xe_sync(struct vk_sync *vk_sync, uint64_t value, bool signal);
diff --git a/src/intel/vulkan/xe/anv_device.c b/src/intel/vulkan/xe/anv_device.c
new file mode 100644
index 00000000000..9eabea31f52
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_device.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "xe/anv_device.h"
+#include "anv_private.h"
+
+#include "drm-uapi/gpu_scheduler.h"
+#include "drm-uapi/xe_drm.h"
+
+#include "common/xe/intel_device_query.h"
+
+bool anv_xe_device_destroy_vm(struct anv_device *device)
+{
+ struct drm_xe_vm_destroy destroy = {
+ .vm_id = device->vm_id,
+ };
+
+ intel_bind_timeline_finish(&device->bind_timeline, device->fd);
+
+ return intel_ioctl(device->fd, DRM_IOCTL_XE_VM_DESTROY, &destroy) == 0;
+}
+
+VkResult anv_xe_device_setup_vm(struct anv_device *device)
+{
+ struct drm_xe_vm_create create = {
+ .flags = DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_VM_CREATE, &create) != 0)
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "vm creation failed");
+
+ device->vm_id = create.vm_id;
+
+ if (!intel_bind_timeline_init(&device->bind_timeline, device->fd)) {
+ anv_xe_device_destroy_vm(device);
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "intel_bind_timeline_init failed");
+ }
+
+ return VK_SUCCESS;
+}
+
+static VkQueueGlobalPriorityKHR
+drm_sched_priority_to_vk_priority(enum drm_sched_priority drm_sched_priority)
+{
+ switch (drm_sched_priority) {
+ case DRM_SCHED_PRIORITY_MIN:
+ return VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR;
+ case DRM_SCHED_PRIORITY_NORMAL:
+ return VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
+ case DRM_SCHED_PRIORITY_HIGH:
+ return VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR;
+ default:
+ unreachable("Invalid drm_sched_priority");
+ return VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR;
+ }
+}
+
+VkResult
+anv_xe_physical_device_get_parameters(struct anv_physical_device *device)
+{
+ struct drm_xe_query_config *config;
+
+ config = xe_device_query_alloc_fetch(device->local_fd, DRM_XE_DEVICE_QUERY_CONFIG, NULL);
+ if (!config)
+ return vk_errorf(device, VK_ERROR_INITIALIZATION_FAILED,
+ "unable to query device config");
+
+ device->has_exec_timeline = true;
+ device->has_vm_control = true;
+ device->max_context_priority =
+ drm_sched_priority_to_vk_priority(config->info[DRM_XE_QUERY_CONFIG_MAX_EXEC_QUEUE_PRIORITY]);
+
+ free(config);
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_xe_physical_device_init_memory_types(struct anv_physical_device *device)
+{
+ if (anv_physical_device_has_vram(device)) {
+ device->memory.type_count = 3;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 1,
+ };
+ device->memory.types[2] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ /* This memory type either comes from heaps[0] if there is only a
+ * mappable vram region, or from heaps[2] if there are both mappable &
+ * non-mappable vram regions.
+ */
+ .heapIndex = device->vram_non_mappable.size > 0 ? 2 : 0,
+ };
+ } else if (device->info.has_llc) {
+ /* Big core GPUs share LLC with the CPU and thus one memory type can be
+ * both cached and coherent at the same time.
+ *
+ * But some game engines can't handle a single memory type well
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/7360#note_1719438
+ *
+ * TODO: With the current UAPI we can't change the mmap mode in Xe, so
+ * only two memory types are supported here.
+ */
+ device->memory.type_count = 2;
+ device->memory.types[0] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[1] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 0,
+ };
+ } else {
+ device->memory.types[device->memory.type_count++] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ .heapIndex = 0,
+ };
+ device->memory.types[device->memory.type_count++] = (struct anv_memory_type) {
+ .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
+ VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ .heapIndex = 0,
+ };
+ }
+ return VK_SUCCESS;
+}
+
+static VkResult
+anv_xe_get_device_status(struct anv_device *device, uint32_t exec_queue_id)
+{
+ VkResult result = VK_SUCCESS;
+ struct drm_xe_exec_queue_get_property exec_queue_get_property = {
+ .exec_queue_id = exec_queue_id,
+ .property = DRM_XE_EXEC_QUEUE_GET_PROPERTY_BAN,
+ };
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC_QUEUE_GET_PROPERTY,
+ &exec_queue_get_property);
+
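+ /* A non-zero BAN property means the KMD has banned this exec queue
+ * (typically after a GPU hang), so report the device as lost.
+ */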
+ if (ret || exec_queue_get_property.value)
+ result = vk_device_set_lost(&device->vk, "One or more queues banned");
+
+ return result;
+}
+
+VkResult
+anv_xe_device_check_status(struct vk_device *vk_device)
+{
+ struct anv_device *device = container_of(vk_device, struct anv_device, vk);
+ VkResult result = VK_SUCCESS;
+
+ for (uint32_t i = 0; i < device->queue_count; i++) {
+ result = anv_xe_get_device_status(device, device->queues[i].exec_queue_id);
+ if (result != VK_SUCCESS)
+ return result;
+
+ if (device->queues[i].companion_rcs_id != 0) {
+ uint32_t exec_queue_id = device->queues[i].companion_rcs_id;
+ result = anv_xe_get_device_status(device, exec_queue_id);
+ if (result != VK_SUCCESS)
+ return result;
+ }
+ }
+
+ return result;
+}
diff --git a/src/intel/vulkan/xe/anv_device.h b/src/intel/vulkan/xe/anv_device.h
new file mode 100644
index 00000000000..5ed069d727d
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_device.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdbool.h>
+
+#include "vulkan/vulkan_core.h"
+#include "vk_device.h"
+
+struct anv_device;
+struct anv_physical_device;
+
+bool anv_xe_device_destroy_vm(struct anv_device *device);
+VkResult anv_xe_device_setup_vm(struct anv_device *device);
+VkResult anv_xe_device_check_status(struct vk_device *vk_device);
+
+VkResult
+anv_xe_physical_device_get_parameters(struct anv_physical_device *device);
+VkResult
+anv_xe_physical_device_init_memory_types(struct anv_physical_device *device);
diff --git a/src/intel/vulkan/xe/anv_kmd_backend.c b/src/intel/vulkan/xe/anv_kmd_backend.c
new file mode 100644
index 00000000000..19cb1caecf4
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_kmd_backend.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <sys/mman.h>
+
+#include "common/xe/intel_engine.h"
+
+#include "anv_private.h"
+
+#include "xe/anv_batch_chain.h"
+
+#include "drm-uapi/gpu_scheduler.h"
+#include "drm-uapi/xe_drm.h"
+
+static uint32_t
+xe_gem_create(struct anv_device *device,
+ const struct intel_memory_class_instance **regions,
+ uint16_t regions_count, uint64_t size,
+ enum anv_bo_alloc_flags alloc_flags,
+ uint64_t *actual_size)
+{
+ /* TODO: protected content */
+ assert((alloc_flags & ANV_BO_ALLOC_PROTECTED) == 0);
+ /* WB+0 way coherent not supported by Xe KMD */
+ assert(alloc_flags & ANV_BO_ALLOC_HOST_COHERENT);
+
+ uint32_t flags = 0;
+ if (alloc_flags & ANV_BO_ALLOC_SCANOUT)
+ flags |= DRM_XE_GEM_CREATE_FLAG_SCANOUT;
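+ /* BOs the CPU needs to access must land in the CPU-visible portion of
+ * VRAM when the device also has a non-mappable VRAM region.
+ */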
+ if ((alloc_flags & (ANV_BO_ALLOC_MAPPED | ANV_BO_ALLOC_LOCAL_MEM_CPU_VISIBLE)) &&
+ !(alloc_flags & ANV_BO_ALLOC_NO_LOCAL_MEM) &&
+ device->physical->vram_non_mappable.size > 0)
+ flags |= DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
+
+ struct drm_xe_gem_create gem_create = {
+ /* From xe_drm.h: If a VM is specified, this BO must:
+ * 1. Only ever be bound to that VM.
+ * 2. Cannot be exported as a PRIME fd.
+ */
+ .vm_id = alloc_flags & ANV_BO_ALLOC_EXTERNAL ? 0 : device->vm_id,
+ .size = align64(size, device->info->mem_alignment),
+ .flags = flags,
+ };
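+ /* placement is a bitmask of the memory regions the BO may live in; the
+ * KMD picks among them.
+ */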
+ for (uint16_t i = 0; i < regions_count; i++)
+ gem_create.placement |= BITFIELD_BIT(regions[i]->instance);
+
+ const struct intel_device_info_pat_entry *pat_entry =
+ anv_device_get_pat_entry(device, alloc_flags);
+ switch (pat_entry->mmap) {
+ case INTEL_DEVICE_INFO_MMAP_MODE_WC:
+ gem_create.cpu_caching = DRM_XE_GEM_CPU_CACHING_WC;
+ break;
+ case INTEL_DEVICE_INFO_MMAP_MODE_WB:
+ gem_create.cpu_caching = DRM_XE_GEM_CPU_CACHING_WB;
+ break;
+ default:
+ unreachable("missing");
+ gem_create.cpu_caching = DRM_XE_GEM_CPU_CACHING_WC;
+ }
+
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_GEM_CREATE, &gem_create))
+ return 0;
+
+ *actual_size = gem_create.size;
+ return gem_create.handle;
+}
+
+static void
+xe_gem_close(struct anv_device *device, struct anv_bo *bo)
+{
+ if (bo->from_host_ptr)
+ return;
+
+ struct drm_gem_close close = {
+ .handle = bo->gem_handle,
+ };
+ intel_ioctl(device->fd, DRM_IOCTL_GEM_CLOSE, &close);
+}
+
+static void *
+xe_gem_mmap(struct anv_device *device, struct anv_bo *bo, uint64_t offset,
+ uint64_t size, void *placed_addr)
+{
+ struct drm_xe_gem_mmap_offset args = {
+ .handle = bo->gem_handle,
+ };
+ if (intel_ioctl(device->fd, DRM_IOCTL_XE_GEM_MMAP_OFFSET, &args))
+ return MAP_FAILED;
+
+ return mmap(placed_addr, size, PROT_READ | PROT_WRITE,
+ (placed_addr != NULL ? MAP_FIXED : 0) | MAP_SHARED,
+ device->fd, args.offset);
+}
+
+static inline uint32_t
+capture_vm_in_error_dump(struct anv_device *device, struct anv_bo *bo)
+{
+ enum anv_bo_alloc_flags alloc_flags = bo ? bo->alloc_flags : 0;
+ bool capture = INTEL_DEBUG(DEBUG_CAPTURE_ALL) ||
+ (alloc_flags & ANV_BO_ALLOC_CAPTURE);
+
+ return capture ? DRM_XE_VM_BIND_FLAG_DUMPABLE : 0;
+}
+
+static struct drm_xe_vm_bind_op
+anv_vm_bind_to_drm_xe_vm_bind(struct anv_device *device,
+ struct anv_vm_bind *anv_bind)
+{
+ struct anv_bo *bo = anv_bind->bo;
+ uint16_t pat_index = bo ?
+ anv_device_get_pat_entry(device, bo->alloc_flags)->index : 0;
+
+ struct drm_xe_vm_bind_op xe_bind = {
+ .obj = 0,
+ .obj_offset = anv_bind->bo_offset,
+ .range = anv_bind->size,
+ .addr = intel_48b_address(anv_bind->address),
+ .op = DRM_XE_VM_BIND_OP_UNMAP,
+ .flags = capture_vm_in_error_dump(device, bo),
+ .prefetch_mem_region_instance = 0,
+ .pat_index = pat_index,
+ };
+
+ if (anv_bind->op == ANV_VM_BIND) {
+ if (!bo) {
+ xe_bind.op = DRM_XE_VM_BIND_OP_MAP;
+ xe_bind.flags |= DRM_XE_VM_BIND_FLAG_NULL;
+ assert(xe_bind.obj_offset == 0);
+ } else if (bo->from_host_ptr) {
+ xe_bind.op = DRM_XE_VM_BIND_OP_MAP_USERPTR;
+ } else {
+ xe_bind.op = DRM_XE_VM_BIND_OP_MAP;
+ xe_bind.obj = bo->gem_handle;
+ }
+ } else if (anv_bind->op == ANV_VM_UNBIND_ALL) {
+ xe_bind.op = DRM_XE_VM_BIND_OP_UNMAP_ALL;
+ xe_bind.obj = bo->gem_handle;
+ assert(anv_bind->address == 0);
+ assert(anv_bind->size == 0);
+ } else {
+ assert(anv_bind->op == ANV_VM_UNBIND);
+ }
+
+ /* userptr and bo_offset are a union! */
+ if (bo && bo->from_host_ptr)
+ xe_bind.userptr = (uintptr_t)bo->map;
+
+ return xe_bind;
+}
+
+static inline VkResult
+xe_vm_bind_op(struct anv_device *device,
+ struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags)
+{
+ VkResult result = VK_SUCCESS;
+ const bool signal_bind_timeline =
+ flags & ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE;
+
+ int num_syncs = submit->wait_count + submit->signal_count +
+ signal_bind_timeline;
+ STACK_ARRAY(struct drm_xe_sync, xe_syncs, num_syncs);
+ if (!xe_syncs)
+ return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ int sync_idx = 0;
+ for (int s = 0; s < submit->wait_count; s++) {
+ xe_syncs[sync_idx++] =
+ vk_sync_to_drm_xe_sync(submit->waits[s].sync,
+ submit->waits[s].wait_value,
+ false);
+ }
+ for (int s = 0; s < submit->signal_count; s++) {
+ xe_syncs[sync_idx++] =
+ vk_sync_to_drm_xe_sync(submit->signals[s].sync,
+ submit->signals[s].signal_value,
+ true);
+ }
+ if (signal_bind_timeline) {
+ xe_syncs[sync_idx++] = (struct drm_xe_sync) {
+ .type = DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ,
+ .flags = DRM_XE_SYNC_FLAG_SIGNAL,
+ .handle = intel_bind_timeline_get_syncobj(&device->bind_timeline),
+ /* .timeline_value will be set later. */
+ };
+ }
+ assert(sync_idx == num_syncs);
+
+ struct drm_xe_vm_bind args = {
+ .vm_id = device->vm_id,
+ .num_binds = submit->binds_len,
+ .bind = {},
+ .num_syncs = num_syncs,
+ .syncs = (uintptr_t)xe_syncs,
+ };
+
+ STACK_ARRAY(struct drm_xe_vm_bind_op, xe_binds_stackarray,
+ submit->binds_len);
+ struct drm_xe_vm_bind_op *xe_binds;
+ if (submit->binds_len > 1) {
+ if (!xe_binds_stackarray) {
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ goto out_syncs;
+ }
+
+ xe_binds = xe_binds_stackarray;
+ args.vector_of_binds = (uintptr_t)xe_binds;
+ } else {
+ xe_binds = &args.bind;
+ }
+
+ for (int i = 0; i < submit->binds_len; i++)
+ xe_binds[i] = anv_vm_bind_to_drm_xe_vm_bind(device, &submit->binds[i]);
+
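+ /* Grab the next bind timeline point right before the ioctl and release it
+ * right after, so timeline values stay ordered with the vm_bind calls.
+ */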
+ if (signal_bind_timeline) {
+ xe_syncs[num_syncs - 1].timeline_value =
+ intel_bind_timeline_bind_begin(&device->bind_timeline);
+ }
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_XE_VM_BIND, &args);
+ int errno_ = errno;
+ if (signal_bind_timeline)
+ intel_bind_timeline_bind_end(&device->bind_timeline);
+
+ if (ret) {
+ assert(errno_ != EINVAL);
+ if (errno_ == ENOMEM)
+ result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+ else
+ result = vk_device_set_lost(&device->vk,
+ "vm_bind failed with errno %d", errno_);
+ goto out_stackarray;
+ }
+
+ ANV_RMV(vm_binds, device, submit->binds, submit->binds_len);
+
+out_stackarray:
+ STACK_ARRAY_FINISH(xe_binds_stackarray);
+out_syncs:
+ STACK_ARRAY_FINISH(xe_syncs);
+
+ return result;
+}
+
+static VkResult
+xe_vm_bind(struct anv_device *device, struct anv_sparse_submission *submit,
+ enum anv_vm_bind_flags flags)
+{
+ return xe_vm_bind_op(device, submit, flags);
+}
+
+static VkResult
+xe_vm_bind_bo(struct anv_device *device, struct anv_bo *bo)
+{
+ struct anv_vm_bind bind = {
+ .bo = bo,
+ .address = bo->offset,
+ .bo_offset = 0,
+ .size = bo->actual_size,
+ .op = ANV_VM_BIND,
+ };
+ struct anv_sparse_submission submit = {
+ .queue = NULL,
+ .binds = &bind,
+ .binds_len = 1,
+ .binds_capacity = 1,
+ .wait_count = 0,
+ .signal_count = 0,
+ };
+ return xe_vm_bind_op(device, &submit,
+ ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE);
+}
+
+static VkResult
+xe_vm_unbind_bo(struct anv_device *device, struct anv_bo *bo)
+{
+ struct anv_vm_bind bind = {
+ .bo = bo,
+ .address = 0,
+ .bo_offset = 0,
+ .size = 0,
+ .op = ANV_VM_UNBIND_ALL,
+ };
+ struct anv_sparse_submission submit = {
+ .queue = NULL,
+ .binds = &bind,
+ .binds_len = 1,
+ .binds_capacity = 1,
+ .wait_count = 0,
+ .signal_count = 0,
+ };
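+ /* Host-pointer BOs have no real GEM handle to unmap-all against, so
+ * unbind the explicit range instead.
+ */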
+ if (bo->from_host_ptr) {
+ bind.address = bo->offset;
+ bind.size = bo->actual_size;
+ bind.op = ANV_VM_UNBIND;
+ }
+ return xe_vm_bind_op(device, &submit,
+ ANV_VM_BIND_FLAG_SIGNAL_BIND_TIMELINE);
+}
+
+static uint32_t
+xe_gem_create_userptr(struct anv_device *device, void *mem, uint64_t size)
+{
+ /* We return the workaround BO gem_handle here, because Xe doesn't
+ * create handles for userptrs. But we still need to make it look
+ * to the rest of Anv that the operation succeeded.
+ */
+ return device->workaround_bo->gem_handle;
+}
+
+static uint32_t
+xe_bo_alloc_flags_to_bo_flags(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
+{
+ return 0;
+}
+
+const struct anv_kmd_backend *
+anv_xe_kmd_backend_get(void)
+{
+ static const struct anv_kmd_backend xe_backend = {
+ .gem_create = xe_gem_create,
+ .gem_create_userptr = xe_gem_create_userptr,
+ .gem_close = xe_gem_close,
+ .gem_mmap = xe_gem_mmap,
+ .vm_bind = xe_vm_bind,
+ .vm_bind_bo = xe_vm_bind_bo,
+ .vm_unbind_bo = xe_vm_unbind_bo,
+ .execute_simple_batch = xe_execute_simple_batch,
+ .execute_trtt_batch = xe_execute_trtt_batch,
+ .queue_exec_locked = xe_queue_exec_locked,
+ .queue_exec_trace = xe_queue_exec_utrace_locked,
+ .bo_alloc_flags_to_bo_flags = xe_bo_alloc_flags_to_bo_flags,
+ };
+ return &xe_backend;
+}
diff --git a/src/intel/vulkan/xe/anv_queue.c b/src/intel/vulkan/xe/anv_queue.c
new file mode 100644
index 00000000000..ac043a40758
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_queue.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "xe/anv_queue.h"
+
+#include "anv_private.h"
+
+#include "common/xe/intel_engine.h"
+#include "common/intel_gem.h"
+
+#include "xe/anv_device.h"
+
+#include "drm-uapi/xe_drm.h"
+#include "drm-uapi/gpu_scheduler.h"
+
+static enum drm_sched_priority
+anv_vk_priority_to_drm_sched_priority(VkQueueGlobalPriorityKHR vk_priority)
+{
+ switch (vk_priority) {
+ case VK_QUEUE_GLOBAL_PRIORITY_LOW_KHR:
+ return DRM_SCHED_PRIORITY_MIN;
+ case VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR:
+ return DRM_SCHED_PRIORITY_NORMAL;
+ case VK_QUEUE_GLOBAL_PRIORITY_HIGH_KHR:
+ return DRM_SCHED_PRIORITY_HIGH;
+ default:
+ unreachable("Invalid priority");
+ return DRM_SCHED_PRIORITY_MIN;
+ }
+}
+
+static VkResult
+create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo,
+ bool create_companion_rcs_engine)
+{
+ struct anv_physical_device *physical = device->physical;
+ uint32_t queue_family_index =
+ create_companion_rcs_engine ?
+ anv_get_first_render_queue_index(physical) :
+ pCreateInfo->queueFamilyIndex;
+ struct anv_queue_family *queue_family =
+ &physical->queue.families[queue_family_index];
+ const struct intel_query_engine_info *engines = physical->engine_info;
+ struct drm_xe_engine_class_instance *instances;
+ const VkDeviceQueueGlobalPriorityCreateInfoKHR *queue_priority =
+ vk_find_struct_const(pCreateInfo->pNext,
+ DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
+ const VkQueueGlobalPriorityKHR priority = queue_priority ?
+ queue_priority->globalPriority :
+ VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR;
+
+ /* As per spec, the driver implementation may deny requests to acquire
+ * a priority above the default priority (MEDIUM) if the caller does not
+ * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_KHR
+ * is returned.
+ */
+ if (physical->max_context_priority >= VK_QUEUE_GLOBAL_PRIORITY_MEDIUM_KHR) {
+ if (priority > physical->max_context_priority)
+ return vk_error(device, VK_ERROR_NOT_PERMITTED_KHR);
+ }
+
+ instances = vk_alloc(&device->vk.alloc,
+ sizeof(*instances) * queue_family->queueCount, 8,
+ VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+ if (!instances)
+ return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+ /* Build a list of all compatible HW engines */
+ uint32_t count = 0;
+ for (uint32_t i = 0; i < engines->num_engines; i++) {
+ const struct intel_engine_class_instance engine = engines->engines[i];
+ if (engine.engine_class != queue_family->engine_class)
+ continue;
+
+ instances[count].engine_class = intel_engine_class_to_xe(engine.engine_class);
+ instances[count].engine_instance = engine.engine_instance;
+ instances[count++].gt_id = engine.gt_id;
+ }
+
+ assert(device->vm_id != 0);
+ struct drm_xe_ext_set_property ext = {
+ .base.name = DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY,
+ .property = DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY,
+ .value = anv_vk_priority_to_drm_sched_priority(priority),
+ };
+ struct drm_xe_exec_queue_create create = {
+ /* Allows KMD to pick one of those engines for the submission queue */
+ .instances = (uintptr_t)instances,
+ .vm_id = device->vm_id,
+ .width = 1,
+ .num_placements = count,
+ .extensions = (uintptr_t)&ext,
+ };
+ int ret = intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &create);
+ vk_free(&device->vk.alloc, instances);
+ if (ret)
+ return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create exec queue");
+
+ if (create_companion_rcs_engine)
+ queue->companion_rcs_id = create.exec_queue_id;
+ else
+ queue->exec_queue_id = create.exec_queue_id;
+
+ return VK_SUCCESS;
+}
+
+VkResult
+anv_xe_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo)
+{
+ VkResult result = create_engine(device, queue, pCreateInfo,
+ false /* create_companion_rcs_engine */);
+
+ if (result != VK_SUCCESS)
+ return result;
+
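+ /* Copy and compute queues also get a companion render (RCS) exec queue
+ * for work that still requires the render engine.
+ */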
+ if (queue->family->engine_class == INTEL_ENGINE_CLASS_COPY ||
+ queue->family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
+ result = create_engine(device, queue, pCreateInfo,
+ true /* create_companion_rcs_engine */);
+ }
+
+ return result;
+}
+
+static void
+destroy_engine(struct anv_device *device, uint32_t exec_queue_id)
+{
+ struct drm_xe_exec_queue_destroy destroy = {
+ .exec_queue_id = exec_queue_id,
+ };
+ intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC_QUEUE_DESTROY, &destroy);
+}
+
+void
+anv_xe_destroy_engine(struct anv_device *device, struct anv_queue *queue)
+{
+ destroy_engine(device, queue->exec_queue_id);
+
+ if (queue->companion_rcs_id != 0)
+ destroy_engine(device, queue->companion_rcs_id);
+}
diff --git a/src/intel/vulkan/xe/anv_queue.h b/src/intel/vulkan/xe/anv_queue.h
new file mode 100644
index 00000000000..646f0ef2f16
--- /dev/null
+++ b/src/intel/vulkan/xe/anv_queue.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2023 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "vulkan/vulkan_core.h"
+
+struct anv_device;
+struct anv_queue;
+
+VkResult
+anv_xe_create_engine(struct anv_device *device,
+ struct anv_queue *queue,
+ const VkDeviceQueueCreateInfo *pCreateInfo);
+void
+anv_xe_destroy_engine(struct anv_device *device, struct anv_queue *queue);